/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text
	.align		6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
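/*
 * For reference, one ChaCha quarter-round on words (a, b, c, d) is, in
 * pseudocode (as in RFC 7539):
 *
 *	a += b;  d ^= a;  d = rotl32(d, 16);
 *	c += d;  b ^= c;  b = rotl32(b, 12);
 *	a += b;  d ^= a;  d = rotl32(d,  8);
 *	c += d;  b ^= c;  b = rotl32(b,  7);
 *
 * Below, v0..v3 hold the four rows of the state, so each NEON instruction
 * applies one of these steps to four quarter-rounds at once.
 */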
chacha_permute:

	adr_l		x10, ROT8
	ld1		{v12.4s}, [x10]
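
	// NEON has no vector rotate instruction, so the rotations are
	// synthesised: rotate by 16 swaps the 16-bit halves of each word
	// (rev32 on .8h lanes), rotates by 12 and 7 use a shl/sri pair, and
	// rotate by 8 is a byte permutation (tbl) using the ROT8 table loaded
	// into v12 above.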

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		w3, w3, #2
	b.ne		.Ldoubleround

	ret
ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	// x0..3 = s0..3
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]
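	// v8-v11 keep an unmodified copy of the state for the feed-forward
	// addition after the permutation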

	bl		chacha_permute

	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds
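	//
	// HChaCha runs the permutation only and returns the first and last
	// rows of the permuted state (no feed-forward addition); this is the
	// construction used to derive the XChaCha subkey.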

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]

	mov		w3, w2
	bl		chacha_permute

	st1		{v0.4s}, [x1], #16
	st1		{v3.4s}, [x1]

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(hchacha_block_neon)

	a0		.req	w12
	a1		.req	w13
	a2		.req	w14
	a3		.req	w15
	a4		.req	w16
	a5		.req	w17
	a6		.req	w19
	a7		.req	w20
	a8		.req	w21
	a9		.req	w22
	a10		.req	w23
	a11		.req	w24
	a12		.req	w25
	a13		.req	w26
	a14		.req	w27
	a15		.req	w28
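
	// The a0-a15 aliases above hold the sixteen words of a fifth ChaCha
	// block that is processed with scalar ALU instructions, interleaved
	// with the four NEON blocks kept in v0-v15.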

	.align		6
ENTRY(chacha_4block_xor_neon)
	frame_push	10

	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds
	// x4: byte count

	adr_l		x10, .Lpermute
	and		x5, x4, #63
	add		x10, x10, x5
	add		x11, x10, #64
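	// x10/x11 now point (len % 64) bytes into the .Lpermute table; the
	// tail handling code at labels 0-3 below uses them as tbl/tbx index
	// vectors so that only the valid part of a final partial block is
	// read and written.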

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix into NEON registers four times. The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, hence requires no word shuffling. For the final XOR step we
	// transpose the matrix by interleaving 32- and then 64-bit words,
	// which allows us to do the XOR in NEON registers.
	//
	// At the same time, a fifth block is encrypted in parallel using
	// scalar registers.
	//
	adr_l		x9, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x9]
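	// v30 := CTRINC, v31 := ROT8 (the two tables are adjacent in .rodata)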

	// x0..15[0-3] = s0..3[0..3]
	add		x8, x0, #16
	ld4r		{ v0.4s- v3.4s}, [x0]
	ld4r		{ v4.4s- v7.4s}, [x8], #16
	ld4r		{ v8.4s-v11.4s}, [x8], #16
	ld4r		{v12.4s-v15.4s}, [x8]

	mov		a0, v0.s[0]
	mov		a1, v1.s[0]
	mov		a2, v2.s[0]
	mov		a3, v3.s[0]
	mov		a4, v4.s[0]
	mov		a5, v5.s[0]
	mov		a6, v6.s[0]
	mov		a7, v7.s[0]
	mov		a8, v8.s[0]
	mov		a9, v9.s[0]
	mov		a10, v10.s[0]
	mov		a11, v11.s[0]
	mov		a12, v12.s[0]
	mov		a13, v13.s[0]
	mov		a14, v14.s[0]
	mov		a15, v15.s[0]

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s
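	// lane k of v12 now holds the block counter + (k + 1); the scalar
	// block keeps the original counter value in a12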

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	  add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	  add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	  add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	  add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	  eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	  eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	  eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	  eor		a15, a15, a3

	rev32		v12.8h, v12.8h
	  ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	  ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	  ror		a14, a14, #16
	rev32		v15.8h, v15.8h
	  ror		a15, a15, #16

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	  add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	  add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	  add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	  add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	  eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	  eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	  eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	  eor		a7, a7, a11

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12
	shl		v6.4s, v18.4s, #12
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	  ror		a4, a4, #20
	sri		v5.4s, v17.4s, #20
	  ror		a5, a5, #20
	sri		v6.4s, v18.4s, #20
	  ror		a6, a6, #20
	sri		v7.4s, v19.4s, #20
	  ror		a7, a7, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	  add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	  add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	  add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	  add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	  eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	  eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	  eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	  eor		a15, a15, a3

	tbl		v12.16b, {v12.16b}, v31.16b
	  ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	  ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	  ror		a14, a14, #24
	tbl		v15.16b, {v15.16b}, v31.16b
	  ror		a15, a15, #24

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	  add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	  add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	  add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	  add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	  eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	  eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	  eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	  eor		a7, a7, a11

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7
	shl		v6.4s, v18.4s, #7
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	  ror		a4, a4, #25
	sri		v5.4s, v17.4s, #25
	  ror		a5, a5, #25
	sri		v6.4s, v18.4s, #25
	  ror		a6, a6, #25
	sri		v7.4s, v19.4s, #25
	  ror		a7, a7, #25

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	  add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	  add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	  add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	  add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	  eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	  eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	  eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	  eor		a14, a14, a3

	rev32		v15.8h, v15.8h
	  ror		a15, a15, #16
	rev32		v12.8h, v12.8h
	  ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	  ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	  ror		a14, a14, #16

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	  add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	  add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	  add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	  add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	  eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	  eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	  eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	  eor		a4, a4, a9

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12
	shl		v7.4s, v18.4s, #12
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	  ror		a5, a5, #20
	sri		v6.4s, v17.4s, #20
	  ror		a6, a6, #20
	sri		v7.4s, v18.4s, #20
	  ror		a7, a7, #20
	sri		v4.4s, v19.4s, #20
	  ror		a4, a4, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	  add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	  add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	  add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	  add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	  eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	  eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	  eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	  eor		a14, a14, a3

	tbl		v15.16b, {v15.16b}, v31.16b
	  ror		a15, a15, #24
	tbl		v12.16b, {v12.16b}, v31.16b
	  ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	  ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	  ror		a14, a14, #24

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	  add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	  add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	  add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	  add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	  eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	  eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	  eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	  eor		a4, a4, a9

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7
	shl		v7.4s, v18.4s, #7
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	  ror		a5, a5, #25
	sri		v6.4s, v17.4s, #25
	  ror		a6, a6, #25
	sri		v7.4s, v18.4s, #25
	  ror		a7, a7, #25
	sri		v4.4s, v19.4s, #25
	  ror		a4, a4, #25

	subs		w3, w3, #2
	b.ne		.Ldoubleround4

	ld4r		{v16.4s-v19.4s}, [x0], #16
	ld4r		{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s
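	// CTRINC has to be added again here: the feed-forward below adds the
	// broadcast original s3[0] to every lane, so the per-block counter
	// offsets (+1..+4) must be restored on top of it.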

	// x0[0-3] += s0[0]
	// x1[0-3] += s0[1]
	// x2[0-3] += s0[2]
	// x3[0-3] += s0[3]
	add		v0.4s, v0.4s, v16.4s
	  mov		w6, v16.s[0]
	  mov		w7, v17.s[0]
	add		v1.4s, v1.4s, v17.4s
	  mov		w8, v18.s[0]
	  mov		w9, v19.s[0]
	add		v2.4s, v2.4s, v18.4s
	  add		a0, a0, w6
	  add		a1, a1, w7
	add		v3.4s, v3.4s, v19.4s
	  add		a2, a2, w8
	  add		a3, a3, w9
CPU_BE(	  rev		a0, a0		)
CPU_BE(	  rev		a1, a1		)
CPU_BE(	  rev		a2, a2		)
CPU_BE(	  rev		a3, a3		)

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

	// x4[0-3] += s1[0]
	// x5[0-3] += s1[1]
	// x6[0-3] += s1[2]
	// x7[0-3] += s1[3]
	add		v4.4s, v4.4s, v20.4s
	  mov		w6, v20.s[0]
	  mov		w7, v21.s[0]
	add		v5.4s, v5.4s, v21.4s
	  mov		w8, v22.s[0]
	  mov		w9, v23.s[0]
	add		v6.4s, v6.4s, v22.4s
	  add		a4, a4, w6
	  add		a5, a5, w7
	add		v7.4s, v7.4s, v23.4s
	  add		a6, a6, w8
	  add		a7, a7, w9
CPU_BE(	  rev		a4, a4		)
CPU_BE(	  rev		a5, a5		)
CPU_BE(	  rev		a6, a6		)
CPU_BE(	  rev		a7, a7		)

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
	// x10[0-3] += s2[2]
	// x11[0-3] += s2[3]
	add		v8.4s, v8.4s, v24.4s
	  mov		w6, v24.s[0]
	  mov		w7, v25.s[0]
	add		v9.4s, v9.4s, v25.4s
	  mov		w8, v26.s[0]
	  mov		w9, v27.s[0]
	add		v10.4s, v10.4s, v26.4s
	  add		a8, a8, w6
	  add		a9, a9, w7
	add		v11.4s, v11.4s, v27.4s
	  add		a10, a10, w8
	  add		a11, a11, w9
CPU_BE(	  rev		a8, a8		)
CPU_BE(	  rev		a9, a9		)
CPU_BE(	  rev		a10, a10	)
CPU_BE(	  rev		a11, a11	)

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
	// x14[0-3] += s3[2]
	// x15[0-3] += s3[3]
	add		v12.4s, v12.4s, v28.4s
	  mov		w6, v28.s[0]
	  mov		w7, v29.s[0]
	add		v13.4s, v13.4s, v29.4s
	  mov		w8, v30.s[0]
	  mov		w9, v31.s[0]
	add		v14.4s, v14.4s, v30.4s
	  add		a12, a12, w6
	  add		a13, a13, w7
	add		v15.4s, v15.4s, v31.4s
	  add		a14, a14, w8
	  add		a15, a15, w9
CPU_BE(	  rev		a12, a12	)
CPU_BE(	  rev		a13, a13	)
CPU_BE(	  rev		a14, a14	)
CPU_BE(	  rev		a15, a15	)

	// interleave 32-bit words in state n, n+1
	  ldp		w6, w7, [x2], #64
	zip1		v16.4s, v0.4s, v1.4s
	  ldp		w8, w9, [x2, #-56]
	  eor		a0, a0, w6
	zip2		v17.4s, v0.4s, v1.4s
	  eor		a1, a1, w7
	zip1		v18.4s, v2.4s, v3.4s
	  eor		a2, a2, w8
	zip2		v19.4s, v2.4s, v3.4s
	  eor		a3, a3, w9
	  ldp		w6, w7, [x2, #-48]
	zip1		v20.4s, v4.4s, v5.4s
	  ldp		w8, w9, [x2, #-40]
	  eor		a4, a4, w6
	zip2		v21.4s, v4.4s, v5.4s
	  eor		a5, a5, w7
	zip1		v22.4s, v6.4s, v7.4s
	  eor		a6, a6, w8
	zip2		v23.4s, v6.4s, v7.4s
	  eor		a7, a7, w9
	  ldp		w6, w7, [x2, #-32]
	zip1		v24.4s, v8.4s, v9.4s
	  ldp		w8, w9, [x2, #-24]
	  eor		a8, a8, w6
	zip2		v25.4s, v8.4s, v9.4s
	  eor		a9, a9, w7
	zip1		v26.4s, v10.4s, v11.4s
	  eor		a10, a10, w8
	zip2		v27.4s, v10.4s, v11.4s
	  eor		a11, a11, w9
	  ldp		w6, w7, [x2, #-16]
	zip1		v28.4s, v12.4s, v13.4s
	  ldp		w8, w9, [x2, #-8]
	  eor		a12, a12, w6
	zip2		v29.4s, v12.4s, v13.4s
	  eor		a13, a13, w7
	zip1		v30.4s, v14.4s, v15.4s
	  eor		a14, a14, w8
	zip2		v31.4s, v14.4s, v15.4s
	  eor		a15, a15, w9

	mov		x3, #64
	subs		x5, x4, #128
	add		x6, x5, x2
	csel		x3, x3, xzr, ge
	csel		x2, x2, x6, ge
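	// If at least 128 bytes of input remain, x3 = 64 and x2 is left alone
	// so the next 64-byte load advances normally.  Otherwise x3 = 0 and
	// x2 is moved back so that the load below covers the final bytes of
	// the input.  The same pattern repeats for the remaining chunks.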

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	  stp		a0, a1, [x1], #64
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	  stp		a2, a3, [x1, #-56]
	ld1		{v16.16b-v19.16b}, [x2], x3

	subs		x6, x4, #192
	ccmp		x3, xzr, #4, lt
	add		x7, x6, x2
	csel		x3, x3, xzr, eq
	csel		x2, x2, x7, eq

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	  stp		a4, a5, [x1, #-48]
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	  stp		a6, a7, [x1, #-40]
	ld1		{v20.16b-v23.16b}, [x2], x3

	subs		x7, x4, #256
	ccmp		x3, xzr, #4, lt
	add		x8, x7, x2
	csel		x3, x3, xzr, eq
	csel		x2, x2, x8, eq

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	  stp		a8, a9, [x1, #-32]
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	  stp		a10, a11, [x1, #-24]
	ld1		{v24.16b-v27.16b}, [x2], x3

	subs		x8, x4, #320
	ccmp		x3, xzr, #4, lt
	add		x9, x8, x2
	csel		x2, x2, x9, eq

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	  stp		a12, a13, [x1, #-16]
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	  stp		a14, a15, [x1, #-8]
	ld1		{v28.16b-v31.16b}, [x2]

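	// x5/x6/x7/x8 hold (byte count - 128/192/256/320); a negative value
	// (bit 63 set) means the corresponding 64-byte chunk is the final,
	// partial one and is handled by the code at labels 0-3 below.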
	// xor with corresponding input, write to output
	tbnz		x5, #63, 0f
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b
	st1		{v16.16b-v19.16b}, [x1], #64
	cbz		x5, .Lout

	tbnz		x6, #63, 1f
	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1], #64
	cbz		x6, .Lout

	tbnz		x7, #63, 2f
	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b
	st1		{v24.16b-v27.16b}, [x1], #64
	cbz		x7, .Lout

	tbnz		x8, #63, 3f
	eor		v28.16b, v28.16b, v12.16b
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b
	st1		{v28.16b-v31.16b}, [x1]

.Lout:	frame_pop
	ret

	// fewer than 128 bytes of in/output
0:	ld1		{v8.16b}, [x10]
	ld1		{v9.16b}, [x11]
	movi		v10.16b, #16
	sub		x2, x1, #64
	add		x1, x1, x5
	ld1		{v16.16b-v19.16b}, [x2]
	tbl		v4.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v5.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v6.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v7.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 192 bytes of in/output
1:	ld1		{v8.16b}, [x10]
	ld1		{v9.16b}, [x11]
	movi		v10.16b, #16
	add		x1, x1, x6
	tbl		v0.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v1.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v2.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v3.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v0.16b
	eor		v21.16b, v21.16b, v1.16b
	eor		v22.16b, v22.16b, v2.16b
	eor		v23.16b, v23.16b, v3.16b
	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 256 bytes of in/output
2:	ld1		{v4.16b}, [x10]
	ld1		{v5.16b}, [x11]
	movi		v6.16b, #16
	add		x1, x1, x7
	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v24.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v25.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v26.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v27.16b, {v20.16b-v23.16b}, v5.16b

	eor		v24.16b, v24.16b, v0.16b
	eor		v25.16b, v25.16b, v1.16b
	eor		v26.16b, v26.16b, v2.16b
	eor		v27.16b, v27.16b, v3.16b
	st1		{v24.16b-v27.16b}, [x1]
	b		.Lout

	// fewer than 320 bytes of in/output
3:	ld1		{v4.16b}, [x10]
	ld1		{v5.16b}, [x11]
	movi		v6.16b, #16
	add		x1, x1, x8
	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v28.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v29.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v30.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v31.16b, {v24.16b-v27.16b}, v5.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x1]
	b		.Lout
ENDPROC(chacha_4block_xor_neon)

	.section	".rodata", "a", %progbits
	.align		L1_CACHE_SHIFT
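	// Byte permutation table: 192 bytes holding the values (i - 64).  A
	// 64-byte window starting at offset (len % 64) supplies the tbl/tbx
	// indices used when the final block of input/output is partial.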
.Lpermute:
	.set		.Li, 0
	.rept		192
	.byte		(.Li - 64)
	.set		.Li, .Li + 1
	.endr

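// CTRINC: per-lane increments added to the block counter row (v12).
// ROT8:   tbl indices implementing a left-rotate by 8 bits of each 32-bit lane.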
CTRINC:	.word		1, 2, 3, 4
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
861