1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
4 *
5 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
6 */
7
8#include <linux/linkage.h>
9#include <asm/assembler.h>
10
/*
 * NEON register aliases used throughout this file.
 *
 * Several names deliberately share one physical register, so only one
 * alias of each group is meaningful at any point in the code:
 *   - XH and IN1 are both q4.
 *   - t4q and T2 are both q9.
 *   - s1l..s4h (d20-d27, set up by the p8 entry point) overlay
 *     HH, HH3, HH4 and HH34 (q10-q13, set up by the p64 entry point).
 *   - MASK (set up by the p64 entry point) and SHASH2_p8 are both d28.
 *   - k16 and SHASH2_H are both d29; k48 and SHASH2_p64 are both d31.
 *   - XL2, XM2, XH2 and T3 (q5-q8, used by the p64 4-way loop) overlay
 *     t0q..t3q (scratch registers of __pmull_p8).
 * The *_L / *_H names are the low / high D halves of the Q register with
 * the same stem (e.g. XL_L = d4 and XL_H = d5 are the halves of XL = q2).
 */
11	SHASH		.req	q0
12	T1		.req	q1
13	XL		.req	q2
14	XM		.req	q3
15	XH		.req	q4
16	IN1		.req	q4
17
18	SHASH_L		.req	d0
19	SHASH_H		.req	d1
20	T1_L		.req	d2
21	T1_H		.req	d3
22	XL_L		.req	d4
23	XL_H		.req	d5
24	XM_L		.req	d6
25	XM_H		.req	d7
26	XH_L		.req	d8
27
/* scratch registers of __pmull_p8: D halves first, then the Q views */
28	t0l		.req	d10
29	t0h		.req	d11
30	t1l		.req	d12
31	t1h		.req	d13
32	t2l		.req	d14
33	t2h		.req	d15
34	t3l		.req	d16
35	t3h		.req	d17
36	t4l		.req	d18
37	t4h		.req	d19
38
39	t0q		.req	q5
40	t1q		.req	q6
41	t2q		.req	q7
42	t3q		.req	q8
43	t4q		.req	q9
44	T2		.req	q9
45
/* p8 path: SHASH_L / SHASH_H rotated by 1..4 bytes (see the p8 entry point) */
46	s1l		.req	d20
47	s1h		.req	d21
48	s2l		.req	d22
49	s2h		.req	d23
50	s3l		.req	d24
51	s3h		.req	d25
52	s4l		.req	d26
53	s4h		.req	d27
54
55	MASK		.req	d28
56	SHASH2_p8	.req	d28
57
/* p8 path: masks applied by __pmull_p8; d31 doubles as SHASH2_p64 */
58	k16		.req	d29
59	k32		.req	d30
60	k48		.req	d31
61	SHASH2_p64	.req	d31
62
/* p64 path: additional key values loaded by pmull_ghash_update_p64 */
63	HH		.req	q10
64	HH3		.req	q11
65	HH4		.req	q12
66	HH34		.req	q13
67
68	HH_L		.req	d20
69	HH_H		.req	d21
70	HH3_L		.req	d22
71	HH3_H		.req	d23
72	HH4_L		.req	d24
73	HH4_H		.req	d25
74	HH34_L		.req	d26
75	HH34_H		.req	d27
76	SHASH2_H	.req	d29
77
/* p64 path: input blocks and partial products of the 4-way loop */
78	XL2		.req	q5
79	XM2		.req	q6
80	XH2		.req	q7
81	T3		.req	q8
82
83	XL2_L		.req	d10
84	XL2_H		.req	d11
85	XM2_L		.req	d12
86	XM2_H		.req	d13
87	T3_L		.req	d16
88	T3_H		.req	d17
89
90	.text
/* enables the vmull.p64 mnemonic used below */
91	.fpu		crypto-neon-fp-armv8
92
/*
 * \rd = \rn * \rm as a polynomial multiplication, done with a single
 * vmull.p64 instruction.
 *
 * The b1-b4 parameters are accepted but not used; they exist so that this
 * macro can be invoked with the same argument list as __pmull_p8 (see the
 * __pmull_\pn invocations in ghash_update).
 */
93	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
94	vmull.p64	\rd, \rn, \rm
95	.endm
96
97	/*
98	 * This implementation of 64x64 -> 128 bit polynomial multiplication
99	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
100	 * "Fast Software Polynomial Multiplication on ARM Processors Using
101	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
102	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
103	 *
104	 * It has been slightly tweaked for in-order performance, and to allow
105	 * 'rq' to overlap with 'ad' or 'bd'.
106	 */
/*
 * \rq = \ad * \bd, where \ad and \bd are D registers and \rq is the Q
 * register receiving the product.
 *
 * b1..b4 may name D registers that already hold \bd rotated by 1, 2, 3 and
 * 4 bytes respectively (the B1..B4 values in the comments below).  When an
 * argument is left at its default (t4l for b1/b3, t3l for b2/b4), the
 * matching .ifc block computes that rotation here with vext.8.
 *
 * Scratch: t0q-t4q.  Inputs besides the operands: k16, k32 and k48, which
 * must have been initialised by the caller.
 */
107	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
108	vext.8		t0l, \ad, \ad, #1	@ A1
109	.ifc		\b1, t4l
110	vext.8		t4l, \bd, \bd, #1	@ B1
111	.endif
112	vmull.p8	t0q, t0l, \bd		@ F = A1*B
113	vext.8		t1l, \ad, \ad, #2	@ A2
114	vmull.p8	t4q, \ad, \b1		@ E = A*B1
115	.ifc		\b2, t3l
116	vext.8		t3l, \bd, \bd, #2	@ B2
117	.endif
118	vmull.p8	t1q, t1l, \bd		@ H = A2*B
119	vext.8		t2l, \ad, \ad, #3	@ A3
120	vmull.p8	t3q, \ad, \b2		@ G = A*B2
121	veor		t0q, t0q, t4q		@ L = E + F
122	.ifc		\b3, t4l
123	vext.8		t4l, \bd, \bd, #3	@ B3
124	.endif
125	vmull.p8	t2q, t2l, \bd		@ J = A3*B
126	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
127	veor		t1q, t1q, t3q		@ M = G + H
128	.ifc		\b4, t3l
129	vext.8		t3l, \bd, \bd, #4	@ B4
130	.endif
131	vmull.p8	t4q, \ad, \b3		@ I = A*B3
132	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
133	vmull.p8	t3q, \ad, \b4		@ K = A*B4
134	vand		t0h, t0h, k48
135	vand		t1h, t1h, k32
136	veor		t2q, t2q, t4q		@ N = I + J
137	veor		t0l, t0l, t0h
138	veor		t1l, t1l, t1h
139	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
140	vand		t2h, t2h, k16
141	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
142	vmov.i64	t3h, #0
/* byte-rotate each partial sum into place before folding it into D */
143	vext.8		t0q, t0q, t0q, #15
144	veor		t2l, t2l, t2h
145	vext.8		t1q, t1q, t1q, #14
/* \ad and \bd are read for the last time here, so \rq may alias either */
146	vmull.p8	\rq, \ad, \bd		@ D = A*B
147	vext.8		t2q, t2q, t2q, #13
148	vext.8		t3q, t3q, t3q, #12
149	veor		t0q, t0q, t1q
150	veor		t2q, t2q, t3q
151	veor		\rq, \rq, t0q
152	veor		\rq, \rq, t2q
153	.endm
154
155	//
156	// PMULL (64x64->128) based reduction for CPUs that can do
157	// it in a single instruction.
158	//
/*
 * In:       XL, XM, XH and MASK (MASK must be set up by the caller).
 * Out:      XL, XH_L and T1 are rewritten; XM is only read.
 * The reduction is completed by the caller, which folds T1 and XH into XL
 * with the veor pair that follows every invocation in ghash_update.
 */
159	.macro		__pmull_reduce_p64
160	vmull.p64	T1, XL_L, MASK
161
162	veor		XH_L, XH_L, XM_H
/* swap the two 64-bit halves of T1 */
163	vext.8		T1, T1, T1, #8
164	veor		XL_H, XL_H, XM_L
165	veor		T1, T1, XL
166
167	vmull.p64	XL, T1_H, MASK
168	.endm
169
170	//
171	// Alternative reduction for CPUs that lack support for the
172	// 64x64->128 PMULL instruction
173	//
/*
 * Same register contract as __pmull_reduce_p64, but built from 64-bit lane
 * shifts and XORs only, and without using MASK.
 *
 * In:       XL, XM, XH.
 * Out:      XL, XH and T1 are rewritten; XM is only read.
 * Clobbers: T2, which is the same register as t4q (q9).
 * The caller completes the reduction with the veor pair that follows the
 * invocation in ghash_update.
 */
174	.macro		__pmull_reduce_p8
175	veor		XL_H, XL_H, XM_L
176	veor		XH_L, XH_L, XM_H
177
/* T1 = (XL << 57) ^ (XL << 62) ^ (XL << 63), per 64-bit lane */
178	vshl.i64	T1, XL, #57
179	vshl.i64	T2, XL, #62
180	veor		T1, T1, T2
181	vshl.i64	T2, XL, #63
182	veor		T1, T1, T2
183	veor		XL_H, XL_H, T1_L
184	veor		XH_L, XH_L, T1_H
185
/* on exit: T1 = XL >> 7 and XL = (XL ^ (XL >> 1)) >> 1, per 64-bit lane */
186	vshr.u64	T1, XL, #1
187	veor		XH, XH, XL
188	veor		XL, XL, T1
189	vshr.u64	T1, T1, #6
190	vshr.u64	XL, XL, #1
191	.endm
192
/*
 * ghash_update <pn>: function body shared by both entry points.  pn is
 * either p64 or p8 and selects __pmull_<pn>, __pmull_reduce_<pn> and
 * SHASH2_<pn>; the 4-way loop is only assembled for p64.
 *
 * Register usage as seen in this macro:
 *   r0   - number of blocks left to process (decremented here)
 *   r1   - address the 16-byte digest XL is loaded from and stored back to
 *   r2   - source pointer, post-incremented by every block load
 *   [sp] - pointer to an optional head block (0 if none), processed first
 *   ip   - scratch
 * The key material (SHASH etc.) must already have been loaded by the
 * enclosing entry point.  The macro ends in bx lr, so it returns to the
 * caller of the enclosing function.
 *
 * Labels: 0 = top of the block loop, 1 = 4-way loop (p64 only),
 * 2 = single-block load, 3 = single-block multiply, 4 = reduction.
 *
 * The bne 0b that closes the loop consumes condition flags produced much
 * earlier (by teq or subs); no flag-setting instruction may be inserted
 * in between.
 *
 * NOTE(review): with no head block and r0 == 0 on entry, both variants
 * still load and process input; callers presumably never do that - confirm.
 */
193	.macro		ghash_update, pn
194	vld1.64		{XL}, [r1]
195
196	/* do the head block first, if supplied */
197	ldr		ip, [sp]
198	teq		ip, #0
199	beq		0f
200	vld1.64		{T1}, [ip]
/* Z is set here iff no further blocks follow the head; tested by bne 0b */
201	teq		r0, #0
202	b		3f
203
2040:	.ifc		\pn, p64
205	tst		r0, #3			// skip until #blocks is a
206	bne		2f			// round multiple of 4
207
/*
 * 4-way loop: every iteration consumes four blocks.  XL2/XM2 are loaded
 * ahead of the loop (and again at the bottom of each iteration), T3/T2 at
 * the top.  Block 1 is XORed into XL and multiplied by HH4, block 2 by
 * HH3, block 3 by HH and block 4 by SHASH; the partial products are
 * accumulated in XH/XL/XM.
 * NOTE(review): HH, HH3 and HH4 are presumably the 2nd, 3rd and 4th powers
 * of the hash key - confirm against the key setup code.
 */
208	vld1.8		{XL2-XM2}, [r2]!
2091:	vld1.8		{T3-T2}, [r2]!
210	vrev64.8	XL2, XL2
211	vrev64.8	XM2, XM2
212
/* flags from this subs are consumed by beq 4f and, via 4:, by bne 0b */
213	subs		r0, r0, #4
214
215	vext.8		T1, XL2, XL2, #8
216	veor		XL2_H, XL2_H, XL_L
217	veor		XL, XL, T1
218
219	vrev64.8	T3, T3
220	vrev64.8	T1, T2
221
222	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
223	veor		XL2_H, XL2_H, XL_H
224	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
225	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)
226
/*
 * For blocks 2-4 the key _H half is paired with the data _L half and vice
 * versa: unlike block 1, these blocks only went through vrev64.8 and did
 * not have their two 64-bit halves exchanged with vext.8.
 */
227	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
228	veor		XM2_L, XM2_L, XM2_H
229	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
230	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)
231
232	veor		XH, XH, XH2
233	veor		XL, XL, XL2
234	veor		XM, XM, XM2
235
236	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
237	veor		T3_L, T3_L, T3_H
238	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
239	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)
240
241	veor		XH, XH, XH2
242	veor		XL, XL, XL2
243	veor		XM, XM, XM2
244
245	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
246	veor		T1_L, T1_L, T1_H
247	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
248	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)
249
250	veor		XH, XH, XH2
251	veor		XL, XL, XL2
252	veor		XM, XM, XM2
253
/* last group of four: share the reduction and the exit path at 4: */
254	beq		4f
255
/* more blocks follow: start loading the next pair, then reduce in line */
256	vld1.8		{XL2-XM2}, [r2]!
257
258	veor		T1, XL, XH
259	veor		XM, XM, T1
260
261	__pmull_reduce_p64
262
263	veor		T1, T1, XH
264	veor		XL, XL, T1
265
266	b		1b
267	.endif
268
/* single-block path; the flags from this subs are consumed by bne 0b */
2692:	vld1.64		{T1}, [r2]!
270	subs		r0, r0, #1
271
2723:	/* multiply XL by SHASH in GF(2^128) */
273#ifndef CONFIG_CPU_BIG_ENDIAN
274	vrev64.8	T1, T1
275#endif
276	vext.8		IN1, T1, T1, #8
277	veor		T1_L, T1_L, XL_H
278	veor		XL, XL, IN1
279
/* the s1h..s4l arguments are only used by the p8 variant of the macro */
280	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
281	veor		T1, T1, XL
282	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
283	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)
284
/* fold the low and high products into the middle one, then reduce */
2854:	veor		T1, XL, XH
286	veor		XM, XM, T1
287
288	__pmull_reduce_\pn
289
290	veor		T1, T1, XH
291	veor		XL, XL, T1
292
/* loop while the last teq/subs on r0 left a non-zero result */
293	bne		0b
294
295	vst1.64		{XL}, [r1]
296	bx		lr
297	.endm
298
299	/*
300	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
301	 *			   struct ghash_key const *k, const char *head)
302	 */
/*
 * vmull.p64 based variant.
 *
 * Loads four consecutive 16-byte values from the key structure at r3 into
 * SHASH, HH, HH3 and HH4, precomputes the XOR of the two halves of each
 * (the b1 + b0 operands of the middle products in ghash_update), sets up
 * MASK for __pmull_reduce_p64 and then runs ghash_update, which also
 * returns to the caller.  r3 is advanced by the post-indexed loads.
 */
303ENTRY(pmull_ghash_update_p64)
304	vld1.64		{SHASH}, [r3]!
305	vld1.64		{HH}, [r3]!
306	vld1.64		{HH3-HH4}, [r3]
307
308	veor		SHASH2_p64, SHASH_L, SHASH_H
309	veor		SHASH2_H, HH_L, HH_H
310	veor		HH34_L, HH3_L, HH3_H
311	veor		HH34_H, HH4_L, HH4_H
312
/* MASK = 0xe1 in every byte, then << 57: MASK = 0xc200000000000000 */
313	vmov.i8		MASK, #0xe1
314	vshl.u64	MASK, MASK, #57
315
316	ghash_update	p64
317ENDPROC(pmull_ghash_update_p64)
318
/*
 * vmull.p8 based variant for CPUs without the 64x64->128 PMULL instruction.
 *
 * Loads only the first 16-byte value (SHASH) from the key structure at r3,
 * precomputes the values that __pmull_p8 would otherwise have to derive on
 * every invocation, and then runs ghash_update, which also returns to the
 * caller.
 */
319ENTRY(pmull_ghash_update_p8)
320	vld1.64		{SHASH}, [r3]
321	veor		SHASH2_p8, SHASH_L, SHASH_H
322
/* both halves of SHASH rotated by 1..4 bytes: b1..b4 inputs of __pmull_p8 */
323	vext.8		s1l, SHASH_L, SHASH_L, #1
324	vext.8		s2l, SHASH_L, SHASH_L, #2
325	vext.8		s3l, SHASH_L, SHASH_L, #3
326	vext.8		s4l, SHASH_L, SHASH_L, #4
327	vext.8		s1h, SHASH_H, SHASH_H, #1
328	vext.8		s2h, SHASH_H, SHASH_H, #2
329	vext.8		s3h, SHASH_H, SHASH_H, #3
330	vext.8		s4h, SHASH_H, SHASH_H, #4
331
/* 16-, 32- and 48-bit low masks applied by __pmull_p8 */
332	vmov.i64	k16, #0xffff
333	vmov.i64	k32, #0xffffffff
334	vmov.i64	k48, #0xffffffffffff
335
336	ghash_update	p8
337ENDPROC(pmull_ghash_update_p8)
338