/* SPDX-License-Identifier: GPL-2.0 */
/*
 * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions
 *
 * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Define the assembler symbols .LvN.2d and .LvN.16b (N = 0..31), each with
 * the value N. The instruction macros in this file prepend ".L" to their
 * operands (e.g. "v4.16b" becomes ".Lv4.16b") to obtain the register number
 * they place into the instruction encoding. Only the .2d and .16b operand
 * spellings are defined, so only those can be passed to the macros.
 */
	.irp	b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
	.set	.Lv\b\().2d, \b
	.set	.Lv\b\().16b, \b
	.endr

/*
 * ARMv8.2 Crypto Extensions instructions
 *
 * The instructions are emitted as raw opcodes with .inst; presumably this
 * is so the file builds with assemblers that lack the mnemonics.
 * Operands must be spelled vN.2d or vN.16b so that ".L<operand>" resolves
 * to the register number N.
 */

/*
 * eor3 rd, rn, rm, ra
 *
 * Emits EOR3 (three-way exclusive OR; per the Arm architecture reference
 * rd = rn ^ rm ^ ra). Field placement: Rd at bit 0, Rn at bit 5, Ra at
 * bit 10, Rm at bit 16.
 */
	.macro	eor3, rd, rn, rm, ra
	.inst	0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
	.endm

/*
 * rax1 rd, rn, rm
 *
 * Emits RAX1 as a raw opcode (per the Arm architecture reference
 * rd = rn ^ rol(rm, 1) on each 64-bit element). Field placement: Rd at
 * bit 0, Rn at bit 5, Rm at bit 16. Operands must be spelled vN.2d or
 * vN.16b so that ".L<operand>" resolves to the register number.
 */
	.macro	rax1, rd, rn, rm
	.inst	0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
	.endm

/*
 * bcax rd, rn, rm, ra
 *
 * Emits BCAX as a raw opcode (bit clear and exclusive OR; per the Arm
 * architecture reference rd = rn ^ (rm & ~ra)). Field placement: Rd at
 * bit 0, Rn at bit 5, Ra at bit 10, Rm at bit 16. Operands must be spelled
 * vN.2d or vN.16b so that ".L<operand>" resolves to the register number.
 */
	.macro	bcax, rd, rn, rm, ra
	.inst	0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
	.endm

/*
 * xar rd, rn, rm, imm6
 *
 * Emits XAR as a raw opcode (exclusive OR and rotate; per the Arm
 * architecture reference rd = ror(rn ^ rm, imm6) on each 64-bit element).
 * Field placement: Rd at bit 0, Rn at bit 5, imm6 at bit 10, Rm at bit 16.
 * imm6 is a rotate-RIGHT amount, so a rotate left by n is requested by
 * passing (64 - n). imm6 is not range checked here: a value outside 0..63
 * would spill into the neighbouring Rm field.
 * Register operands must be spelled vN.2d or vN.16b so that ".L<operand>"
 * resolves to the register number.
 */
	.macro	xar, rd, rn, rm, imm6
	.inst	0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
	.endm

/*
 * sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
 *
 * Absorbs 'blocks' consecutive input blocks from 'data' into the state at
 * 'st': each block is XORed into the leading state lanes and is followed
 * by 24 rounds of the permutation. The state consists of 25 64-bit lanes
 * (200 bytes); it is read from 'st' on entry and written back on exit.
 *
 *   x0: st       - state, held in v0-v24 (one lane per register)
 *   x1: data     - input, advanced by one block per iteration
 *   w2: blocks   - number of blocks; must be at least 1, because the
 *                  counter is decremented before it is first tested
 *   w3: dg_size  - digest size in bytes; only bits 6, 4 and 2 are tested
 *                  and they select how many lanes one block covers:
 *                    bit 6 set                  ->  9 lanes ( 72 bytes)
 *                    bit 6 clear, 4 clear       -> 17 lanes (136 bytes)
 *                    bit 6 clear, 4 set, 2 clr  -> 13 lanes (104 bytes)
 *                    bit 6 clear, 4 set, 2 set  -> 18 lanes (144 bytes)
 *                  i.e. SHA3-512, SHA3-256, SHA3-384 and SHA3-224.
 *
 * Register usage inside the routine:
 *   x19-x22  copies of the four arguments (st, data, blocks, dg_size)
 *   w8       round counter; x8 is also the address scratch for state I/O
 *   x9       pointer into the round constant table .Lsha3_rcon
 *   v0-v24   state lanes st[0]..st[24] in the low 64 bits
 *   v25-v31  scratch
 *
 * NOTE(review): v8-v15 are overwritten and no save/restore of them is
 * visible in this file; callers presumably own the NEON unit while this
 * runs - confirm against the C glue code.
 */
	.text
ENTRY(sha3_ce_transform)
	frame_push	4		// macro from <asm/assembler.h>; presumably saves x19-x22

	mov	x19, x0			// st
	mov	x20, x1			// data
	mov	x21, x2			// blocks (only w21 is used)
	mov	x22, x3			// dg_size (only single bits are tested)

0:	/* load state: lanes 0-3 via x19, lanes 4-24 via x8 */
	add	x8, x19, #32
	ld1	{ v0.1d- v3.1d}, [x19]
	ld1	{ v4.1d- v7.1d}, [x8], #32
	ld1	{ v8.1d-v11.1d}, [x8], #32
	ld1	{v12.1d-v15.1d}, [x8], #32
	ld1	{v16.1d-v19.1d}, [x8], #32
	ld1	{v20.1d-v23.1d}, [x8], #32
	ld1	{v24.1d}, [x8]

	/* per block: count it, then reset round counter and constant pointer */
1:	sub	w21, w21, #1
	mov	w8, #24
	adr_l	x9, .Lsha3_rcon

	/* load input: the first 7 lanes (56 bytes) are common to all sizes */
	ld1	{v25.8b-v28.8b}, [x20], #32
	ld1	{v29.8b-v31.8b}, [x20], #24
	eor	v0.8b, v0.8b, v25.8b
	eor	v1.8b, v1.8b, v26.8b
	eor	v2.8b, v2.8b, v27.8b
	eor	v3.8b, v3.8b, v28.8b
	eor	v4.8b, v4.8b, v29.8b
	eor	v5.8b, v5.8b, v30.8b
	eor	v6.8b, v6.8b, v31.8b

	tbnz	x22, #6, 3f		// SHA3-512

	/* lanes 7-12: shared by SHA3-384, SHA3-256 and SHA3-224 */
	ld1	{v25.8b-v28.8b}, [x20], #32
	ld1	{v29.8b-v30.8b}, [x20], #16
	eor	 v7.8b,  v7.8b, v25.8b
	eor	 v8.8b,  v8.8b, v26.8b
	eor	 v9.8b,  v9.8b, v27.8b
	eor	v10.8b, v10.8b, v28.8b
	eor	v11.8b, v11.8b, v29.8b
	eor	v12.8b, v12.8b, v30.8b

	tbnz	x22, #4, 2f		// SHA3-384 or SHA3-224

	// SHA3-256: lanes 13-16 complete the 17 lane block
	ld1	{v25.8b-v28.8b}, [x20], #32
	eor	v13.8b, v13.8b, v25.8b
	eor	v14.8b, v14.8b, v26.8b
	eor	v15.8b, v15.8b, v27.8b
	eor	v16.8b, v16.8b, v28.8b
	b	4f

2:	tbz	x22, #2, 4f		// bit 2 cleared? SHA3-384, block is complete

	// SHA3-224: lanes 13-17 complete the 18 lane block
	ld1	{v25.8b-v28.8b}, [x20], #32
	ld1	{v29.8b}, [x20], #8
	eor	v13.8b, v13.8b, v25.8b
	eor	v14.8b, v14.8b, v26.8b
	eor	v15.8b, v15.8b, v27.8b
	eor	v16.8b, v16.8b, v28.8b
	eor	v17.8b, v17.8b, v29.8b
	b	4f

	// SHA3-512: lanes 7-8 complete the 9 lane block
3:	ld1	{v25.8b-v26.8b}, [x20], #16
	eor	 v7.8b,  v7.8b, v25.8b
	eor	 v8.8b,  v8.8b, v26.8b

	/* one round per pass; w8 counts the remaining rounds */
4:	sub	w8, w8, #1

	/*
	 * Column parities: v25-v29 = C[0]-C[4], where
	 * C[x] = st[x] ^ st[x+5] ^ st[x+10] ^ st[x+15] ^ st[x+20]
	 */
	eor3	v29.16b,  v4.16b,  v9.16b, v14.16b
	eor3	v26.16b,  v1.16b,  v6.16b, v11.16b
	eor3	v28.16b,  v3.16b,  v8.16b, v13.16b
	eor3	v25.16b,  v0.16b,  v5.16b, v10.16b
	eor3	v27.16b,  v2.16b,  v7.16b, v12.16b
	eor3	v29.16b, v29.16b, v19.16b, v24.16b
	eor3	v26.16b, v26.16b, v16.16b, v21.16b
	eor3	v28.16b, v28.16b, v18.16b, v23.16b
	eor3	v25.16b, v25.16b, v15.16b, v20.16b
	eor3	v27.16b, v27.16b, v17.16b, v22.16b

	/*
	 * Per-column value applied to column x (the "bc[x]" below):
	 * C[x-1] ^ rol(C[x+1], 1). The order matters: each C value is
	 * overwritten only after its last use as an input.
	 */
	rax1	v30.2d, v29.2d, v26.2d	// bc[0] = C[4] ^ rol(C[1], 1)
	rax1	v26.2d, v26.2d, v28.2d	// bc[2] = C[1] ^ rol(C[3], 1)
	rax1	v28.2d, v28.2d, v25.2d	// bc[4] = C[3] ^ rol(C[0], 1)
	rax1	v25.2d, v25.2d, v27.2d	// bc[1] = C[0] ^ rol(C[2], 1)
	rax1	v27.2d, v27.2d, v29.2d	// bc[3] = C[2] ^ rol(C[4], 1)

	/*
	 * Each xar XORs the column value into a lane, rotates it left by
	 * that lane's offset (passed as a right rotate of 64 - n) and
	 * writes the result to the register of the lane it moves to.
	 * Results whose home register is still needed as an input are
	 * parked in other registers until the bcax step:
	 *   lane  3 in v27, lane  4 in v28, lane  7 in v30, lane  8 in v4,
	 *   lane 10 in v29, lane 11 in v26, lane 17 in v25, lane 18 in v3,
	 *   lane 20 in v31, lane 21 in v8
	 * Lane 0 has no rotation and stays in place, hence the plain eor.
	 */
	eor	 v0.16b,  v0.16b, v30.16b
	xar	 v29.2d,   v1.2d,  v25.2d, (64 - 1)
	xar	  v1.2d,   v6.2d,  v25.2d, (64 - 44)
	xar	  v6.2d,   v9.2d,  v28.2d, (64 - 20)
	xar	  v9.2d,  v22.2d,  v26.2d, (64 - 61)
	xar	 v22.2d,  v14.2d,  v28.2d, (64 - 39)
	xar	 v14.2d,  v20.2d,  v30.2d, (64 - 18)
	xar	 v31.2d,   v2.2d,  v26.2d, (64 - 62)
	xar	  v2.2d,  v12.2d,  v26.2d, (64 - 43)
	xar	 v12.2d,  v13.2d,  v27.2d, (64 - 25)
	xar	 v13.2d,  v19.2d,  v28.2d, (64 - 8)
	xar	 v19.2d,  v23.2d,  v27.2d, (64 - 56)
	xar	 v23.2d,  v15.2d,  v30.2d, (64 - 41)
	xar	 v15.2d,   v4.2d,  v28.2d, (64 - 27)
	xar	 v28.2d,  v24.2d,  v28.2d, (64 - 14)	// last use of bc[4]
	xar	 v24.2d,  v21.2d,  v25.2d, (64 - 2)
	xar	  v8.2d,   v8.2d,  v27.2d, (64 - 55)
	xar	  v4.2d,  v16.2d,  v25.2d, (64 - 45)
	xar	 v16.2d,   v5.2d,  v30.2d, (64 - 36)
	xar	  v5.2d,   v3.2d,  v27.2d, (64 - 28)
	xar	 v27.2d,  v18.2d,  v27.2d, (64 - 21)	// last use of bc[3]
	xar	  v3.2d,  v17.2d,  v26.2d, (64 - 15)
	xar	 v25.2d,  v11.2d,  v25.2d, (64 - 10)	// last use of bc[1]
	xar	 v26.2d,   v7.2d,  v26.2d, (64 - 6)	// last use of bc[2]
	xar	 v30.2d,  v10.2d,  v30.2d, (64 - 3)	// last use of bc[0]

	/*
	 * Row by row: st[i] = b[i] ^ (~b[i+1] & b[i+2]), indices taken
	 * within each row of five lanes. Rows are done from lanes 20-24
	 * down to lanes 0-4 so that parked values are consumed before
	 * their registers are reused.
	 */
	bcax	v20.16b, v31.16b, v22.16b,  v8.16b
	bcax	v21.16b,  v8.16b, v23.16b, v22.16b
	bcax	v22.16b, v22.16b, v24.16b, v23.16b
	bcax	v23.16b, v23.16b, v31.16b, v24.16b
	bcax	v24.16b, v24.16b,  v8.16b, v31.16b

	/* v31 is free again: fetch this round's constant and advance x9 */
	ld1r	{v31.2d}, [x9], #8

	bcax	v17.16b, v25.16b, v19.16b,  v3.16b
	bcax	v18.16b,  v3.16b, v15.16b, v19.16b
	bcax	v19.16b, v19.16b, v16.16b, v15.16b
	bcax	v15.16b, v15.16b, v25.16b, v16.16b
	bcax	v16.16b, v16.16b,  v3.16b, v25.16b

	bcax	v10.16b, v29.16b, v12.16b, v26.16b
	bcax	v11.16b, v26.16b, v13.16b, v12.16b
	bcax	v12.16b, v12.16b, v14.16b, v13.16b
	bcax	v13.16b, v13.16b, v29.16b, v14.16b
	bcax	v14.16b, v14.16b, v26.16b, v29.16b

	bcax	 v7.16b, v30.16b,  v9.16b,  v4.16b
	bcax	 v8.16b,  v4.16b,  v5.16b,  v9.16b
	bcax	 v9.16b,  v9.16b,  v6.16b,  v5.16b
	bcax	 v5.16b,  v5.16b, v30.16b,  v6.16b
	bcax	 v6.16b,  v6.16b,  v4.16b, v30.16b

	bcax	 v3.16b, v27.16b,  v0.16b, v28.16b
	bcax	 v4.16b, v28.16b,  v1.16b,  v0.16b
	bcax	 v0.16b,  v0.16b,  v2.16b,  v1.16b
	bcax	 v1.16b,  v1.16b, v27.16b,  v2.16b
	bcax	 v2.16b,  v2.16b, v28.16b, v27.16b

	/* XOR the round constant into lane 0 */
	eor	 v0.16b,  v0.16b, v31.16b

	cbnz	w8, 4b			// more rounds for this block
	cbz	w21, 5f			// no blocks left: write back and return

	/*
	 * Conditional NEON yield (macros from <asm/assembler.h>). When a
	 * yield is going to happen, the state is written back to st first
	 * and reloaded at 0: afterwards; presumably the vector registers
	 * are not preserved across the yield. Otherwise control continues
	 * after endif_yield_neon and the next block is absorbed directly.
	 */
	if_will_cond_yield_neon
	add	x8, x19, #32
	st1	{ v0.1d- v3.1d}, [x19]
	st1	{ v4.1d- v7.1d}, [x8], #32
	st1	{ v8.1d-v11.1d}, [x8], #32
	st1	{v12.1d-v15.1d}, [x8], #32
	st1	{v16.1d-v19.1d}, [x8], #32
	st1	{v20.1d-v23.1d}, [x8], #32
	st1	{v24.1d}, [x8]
	do_cond_yield_neon
	b	0b
	endif_yield_neon

	b	1b

	/* save state; x19 may be advanced here as it is not needed again */
5:	st1	{ v0.1d- v3.1d}, [x19], #32
	st1	{ v4.1d- v7.1d}, [x19], #32
	st1	{ v8.1d-v11.1d}, [x19], #32
	st1	{v12.1d-v15.1d}, [x19], #32
	st1	{v16.1d-v19.1d}, [x19], #32
	st1	{v20.1d-v23.1d}, [x19], #32
	st1	{v24.1d}, [x19]
	frame_pop
	ret
ENDPROC(sha3_ce_transform)

/*
 * Round constants: 24 64-bit values, one per round, read in order through
 * x9 by sha3_ce_transform and XORed into state lane 0 at the end of each
 * round.
 *
 * NOTE(review): on arm64 the GNU assembler takes the .align operand as a
 * power of two, so ".align 8" requests 256 byte alignment, far more than
 * the 8 byte element loads need. Left unchanged here - confirm whether
 * that much alignment is intended.
 */
	.section	".rodata", "a"
	.align		8
.Lsha3_rcon:
	.quad	0x0000000000000001, 0x0000000000008082, 0x800000000000808a
	.quad	0x8000000080008000, 0x000000000000808b, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000000008009, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000080008009, 0x000000008000000a
	.quad	0x000000008000808b, 0x800000000000008b, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008002, 0x8000000000000080
	.quad	0x000000000000800a, 0x800000008000000a, 0x8000000080008081
	.quad	0x8000000000008080, 0x0000000080000001, 0x8000000080008008
234