/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Register macros */

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11

#define RX0	v12
#define RX1	v13
#define RKEY	v14
#define RIV	v15

/* Helper macros. */

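/*
 * PREPARE loads the 256-byte SM4 S-box (crypto_sm4_sbox) into v16-v31
 * so that the non-linear substitution can be done with tbl/tbx lookups.
 */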
#define PREPARE                                                 \
	adr_l		x5, crypto_sm4_sbox;                    \
	ld1		{v16.16b-v19.16b}, [x5], #64;           \
	ld1		{v20.16b-v23.16b}, [x5], #64;           \
	ld1		{v24.16b-v27.16b}, [x5], #64;           \
	ld1		{v28.16b-v31.16b}, [x5];

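/*
 * Transpose a 4x4 matrix of 32-bit words held in s0..s3, so that each
 * vector ends up holding the same word index from all four blocks.
 */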
#define transpose_4x4(s0, s1, s2, s3)                           \
	zip1		RTMP0.4s, s0.4s, s1.4s;                 \
	zip1		RTMP1.4s, s2.4s, s3.4s;                 \
	zip2		RTMP2.4s, s0.4s, s1.4s;                 \
	zip2		RTMP3.4s, s2.4s, s3.4s;                 \
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;              \
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;              \
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;              \
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;

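/*
 * Undo the transpose and at the same time reverse the word order within
 * each block, which implements the SM4 output reverse transform R while
 * restoring the one-block-per-register layout.
 */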
#define rotate_clockwise_90(s0, s1, s2, s3)                     \
	zip1		RTMP0.4s, s1.4s, s0.4s;                 \
	zip2		RTMP1.4s, s1.4s, s0.4s;                 \
	zip1		RTMP2.4s, s3.4s, s2.4s;                 \
	zip2		RTMP3.4s, s3.4s, s2.4s;                 \
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;              \
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;              \
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;              \
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;

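/*
 * One SM4 round on four blocks in parallel:
 *   s0 ^= L(Sbox(rk ^ s1 ^ s2 ^ s3))
 * 'round' selects one of the four round keys held in RKEY; the 256-byte
 * S-box lookup is split into four 64-byte tbl/tbx steps.
 */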
#define ROUND4(round, s0, s1, s2, s3)                           \
	dup		RX0.4s, RKEY.s[round];                  \
	/* rk ^ s1 ^ s2 ^ s3 */                                 \
	eor		RTMP1.16b, s2.16b, s3.16b;              \
	eor		RX0.16b, RX0.16b, s1.16b;               \
	eor		RX0.16b, RX0.16b, RTMP1.16b;            \
                                                                \
	/* sbox, non-linear part */                             \
	movi		RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
                                                                \
	/* linear part */                                       \
	shl		RTMP1.4s, RTMP0.4s, #8;                 \
	shl		RTMP2.4s, RTMP0.4s, #16;                \
	shl		RTMP3.4s, RTMP0.4s, #24;                \
	sri		RTMP1.4s, RTMP0.4s, #(32-8);            \
	sri		RTMP2.4s, RTMP0.4s, #(32-16);           \
	sri		RTMP3.4s, RTMP0.4s, #(32-24);           \
	/* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */            \
	eor		RTMP1.16b, RTMP1.16b, RTMP0.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP2.16b;        \
	/* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */        \
	eor		RTMP3.16b, RTMP3.16b, RTMP0.16b;        \
	shl		RTMP2.4s, RTMP1.4s, #2;                 \
	sri		RTMP2.4s, RTMP1.4s, #(32-2);            \
	eor		RTMP3.16b, RTMP3.16b, RTMP2.16b;        \
	/* s0 ^= RTMP3 */                                       \
	eor		s0.16b, s0.16b, RTMP3.16b;

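/*
 * Encrypt or decrypt four blocks (which one depends on the round key order
 * supplied by the caller): byte-swap the 32-bit words, transpose, run
 * 32 rounds (8 iterations of 4 rounds, loading 4 round keys per iteration),
 * then undo the transpose with reversed word order and byte-swap back.
 * x0 is rewound to the start of the round key array afterwards.
 */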
#define SM4_CRYPT_BLK4(b0, b1, b2, b3)                          \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
                                                                \
	transpose_4x4(b0, b1, b2, b3);                          \
                                                                \
	mov		x6, #8;                                 \
4:                                                              \
	ld1		{RKEY.4s}, [x0], #16;                   \
	subs		x6, x6, #1;                             \
                                                                \
	ROUND4(0, b0, b1, b2, b3);                              \
	ROUND4(1, b1, b2, b3, b0);                              \
	ROUND4(2, b2, b3, b0, b1);                              \
	ROUND4(3, b3, b0, b1, b2);                              \
                                                                \
	bne		4b;                                     \
                                                                \
	rotate_clockwise_90(b0, b1, b2, b3);                    \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
                                                                \
	/* repoint to rkey */                                   \
	sub		x0, x0, #128;

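/*
 * Same round as ROUND4, but two independent groups of four blocks
 * (s0..s3 and t0..t3) are processed in an interleaved fashion, using
 * RX0/RX1 and sharing the broadcast round key.
 */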
#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)           \
	/* rk ^ s1 ^ s2 ^ s3 */                                 \
	dup		RX0.4s, RKEY.s[round];                  \
	eor		RTMP0.16b, s2.16b, s3.16b;              \
	mov		RX1.16b, RX0.16b;                       \
	eor		RTMP1.16b, t2.16b, t3.16b;              \
	eor		RX0.16b, RX0.16b, s1.16b;               \
	eor		RX1.16b, RX1.16b, t1.16b;               \
	eor		RX0.16b, RX0.16b, RTMP0.16b;            \
	eor		RX1.16b, RX1.16b, RTMP1.16b;            \
                                                                \
	/* sbox, non-linear part */                             \
	movi		RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
	tbl		RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;  \
                                                                \
	/* linear part */                                       \
	shl		RX0.4s, RTMP0.4s, #8;                   \
	shl		RX1.4s, RTMP1.4s, #8;                   \
	shl		RTMP2.4s, RTMP0.4s, #16;                \
	shl		RTMP3.4s, RTMP1.4s, #16;                \
	sri		RX0.4s, RTMP0.4s, #(32 - 8);            \
	sri		RX1.4s, RTMP1.4s, #(32 - 8);            \
	sri		RTMP2.4s, RTMP0.4s, #(32 - 16);         \
	sri		RTMP3.4s, RTMP1.4s, #(32 - 16);         \
	/* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */               \
	eor		RX0.16b, RX0.16b, RTMP0.16b;            \
	eor		RX1.16b, RX1.16b, RTMP1.16b;            \
	eor		RX0.16b, RX0.16b, RTMP2.16b;            \
	eor		RX1.16b, RX1.16b, RTMP3.16b;            \
	/* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */        \
	shl		RTMP2.4s, RTMP0.4s, #24;                \
	shl		RTMP3.4s, RTMP1.4s, #24;                \
	sri		RTMP2.4s, RTMP0.4s, #(32 - 24);         \
	sri		RTMP3.4s, RTMP1.4s, #(32 - 24);         \
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
	shl		RTMP2.4s, RX0.4s, #2;                   \
	shl		RTMP3.4s, RX1.4s, #2;                   \
	sri		RTMP2.4s, RX0.4s, #(32 - 2);            \
	sri		RTMP3.4s, RX1.4s, #(32 - 2);            \
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
	/* s0/t0 ^= RTMP0/1 */                                  \
	eor		s0.16b, s0.16b, RTMP0.16b;              \
	eor		t0.16b, t0.16b, RTMP1.16b;

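/*
 * Eight-block variant of SM4_CRYPT_BLK4: two 4x4 transposes, then 32
 * rounds over both four-block groups, then untranspose and byte-swap.
 */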
#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)          \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
	rev32		b4.16b, b4.16b;                         \
	rev32		b5.16b, b5.16b;                         \
	rev32		b6.16b, b6.16b;                         \
	rev32		b7.16b, b7.16b;                         \
                                                                \
	transpose_4x4(b0, b1, b2, b3);                          \
	transpose_4x4(b4, b5, b6, b7);                          \
                                                                \
	mov		x6, #8;                                 \
8:                                                              \
	ld1		{RKEY.4s}, [x0], #16;                   \
	subs		x6, x6, #1;                             \
                                                                \
	ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);              \
	ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);              \
	ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);              \
	ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);              \
                                                                \
	bne		8b;                                     \
                                                                \
	rotate_clockwise_90(b0, b1, b2, b3);                    \
	rotate_clockwise_90(b4, b5, b6, b7);                    \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
	rev32		b4.16b, b4.16b;                         \
	rev32		b5.16b, b5.16b;                         \
	rev32		b6.16b, b6.16b;                         \
	rev32		b7.16b, b7.16b;                         \
                                                                \
	/* repoint to rkey */                                   \
	sub		x0, x0, #128;


.align 3
SYM_FUNC_START_LOCAL(__sm4_neon_crypt_blk1_4)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: num blocks (1..4)
	 */
	PREPARE;

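	/*
	 * Load 1 to 4 blocks; slots beyond 'num blocks' are filled with
	 * copies of block 0 so that all four vectors hold valid data.
	 */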
	ld1		{v0.16b}, [x2], #16;
	mov		v1.16b, v0.16b;
	mov		v2.16b, v0.16b;
	mov		v3.16b, v0.16b;
	cmp		w3, #2;
	blt		.Lblk4_load_input_done;
	ld1		{v1.16b}, [x2], #16;
	beq		.Lblk4_load_input_done;
	ld1		{v2.16b}, [x2], #16;
	cmp		w3, #3;
	beq		.Lblk4_load_input_done;
	ld1		{v3.16b}, [x2];

.Lblk4_load_input_done:
	SM4_CRYPT_BLK4(v0, v1, v2, v3);

	st1		{v0.16b}, [x1], #16;
	cmp		w3, #2;
	blt		.Lblk4_store_output_done;
	st1		{v1.16b}, [x1], #16;
	beq		.Lblk4_store_output_done;
	st1		{v2.16b}, [x1], #16;
	cmp		w3, #3;
	beq		.Lblk4_store_output_done;
	st1		{v3.16b}, [x1];

.Lblk4_store_output_done:
	ret;
SYM_FUNC_END(__sm4_neon_crypt_blk1_4)

.align 3
SYM_FUNC_START(sm4_neon_crypt_blk1_8)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: num blocks (1..8)
	 */
	cmp		w3, #5;
	blt		__sm4_neon_crypt_blk1_4;

	PREPARE;

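	/*
	 * Load 5 to 8 blocks; slots beyond 'num blocks' are filled with
	 * copies of block 4 so that all eight vectors hold valid data.
	 */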
	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b}, [x2], #16;
	mov		v5.16b, v4.16b;
	mov		v6.16b, v4.16b;
	mov		v7.16b, v4.16b;
	beq		.Lblk8_load_input_done;
	ld1		{v5.16b}, [x2], #16;
	cmp		w3, #7;
	blt		.Lblk8_load_input_done;
	ld1		{v6.16b}, [x2], #16;
	beq		.Lblk8_load_input_done;
	ld1		{v7.16b}, [x2];

.Lblk8_load_input_done:
	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	cmp		w3, #6;
	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b}, [x1], #16;
	blt		.Lblk8_store_output_done;
	st1		{v5.16b}, [x1], #16;
	beq		.Lblk8_store_output_done;
	st1		{v6.16b}, [x1], #16;
	cmp		w3, #7;
	beq		.Lblk8_store_output_done;
	st1		{v7.16b}, [x1];

.Lblk8_store_output_done:
	ret;
SYM_FUNC_END(sm4_neon_crypt_blk1_8)

.align 3
SYM_FUNC_START(sm4_neon_crypt_blk8)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks (multiples of 8)
	 */
	PREPARE;

.Lcrypt_loop_blk:
	subs		w3, w3, #8;
	bmi		.Lcrypt_end;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b-v7.16b}, [x2], #64;

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1], #64;

	b		.Lcrypt_loop_blk;

.Lcrypt_end:
	ret;
SYM_FUNC_END(sm4_neon_crypt_blk8)

.align 3
SYM_FUNC_START(sm4_neon_cbc_dec_blk8)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks (multiples of 8)
	 */
	PREPARE;

	ld1		{RIV.16b}, [x3];

.Lcbc_loop_blk:
	subs		w4, w4, #8;
	bmi		.Lcbc_end;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b-v7.16b}, [x2];

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

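	/*
	 * CBC decrypt: XOR each decrypted block with the IV or the previous
	 * ciphertext block, re-reading the ciphertext from src; the last
	 * ciphertext block becomes the IV for the next iteration.
	 */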
	sub		x2, x2, #64;
	eor		v0.16b, v0.16b, RIV.16b;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v1.16b, v1.16b, RTMP0.16b;
	eor		v2.16b, v2.16b, RTMP1.16b;
	eor		v3.16b, v3.16b, RTMP2.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	eor		v4.16b, v4.16b, RTMP3.16b;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v5.16b, v5.16b, RTMP0.16b;
	eor		v6.16b, v6.16b, RTMP1.16b;
	eor		v7.16b, v7.16b, RTMP2.16b;

	mov		RIV.16b, RTMP3.16b;
	st1		{v4.16b-v7.16b}, [x1], #64;

	b		.Lcbc_loop_blk;

.Lcbc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3];

	ret;
SYM_FUNC_END(sm4_neon_cbc_dec_blk8)

.align 3
SYM_FUNC_START(sm4_neon_cfb_dec_blk8)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks (multiples of 8)
	 */
	PREPARE;

	ld1		{v0.16b}, [x3];

.Lcfb_loop_blk:
	subs		w4, w4, #8;
	bmi		.Lcfb_end;

	ld1		{v1.16b, v2.16b, v3.16b}, [x2], #48;
	ld1		{v4.16b-v7.16b}, [x2];

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

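	/*
	 * CFB decrypt: XOR the encrypted IV/ciphertext keystream with the
	 * ciphertext re-read from src; the last ciphertext block becomes
	 * the IV for the next iteration.
	 */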
	sub		x2, x2, #48;
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v4.16b, v4.16b, RTMP0.16b;
	eor		v5.16b, v5.16b, RTMP1.16b;
	eor		v6.16b, v6.16b, RTMP2.16b;
	eor		v7.16b, v7.16b, RTMP3.16b;
	st1		{v4.16b-v7.16b}, [x1], #64;

	mov		v0.16b, RTMP3.16b;

	b		.Lcfb_loop_blk;

.Lcfb_end:
	/* store new IV */
	st1		{v0.16b}, [x3];

	ret;
SYM_FUNC_END(sm4_neon_cfb_dec_blk8)

.align 3
SYM_FUNC_START(sm4_neon_ctr_enc_blk8)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks (multiples of 8)
	 */
	PREPARE;

	ldp		x7, x8, [x3];
	rev		x7, x7;
	rev		x8, x8;

.Lctr_loop_blk:
	subs		w4, w4, #8;
	bmi		.Lctr_end;

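/*
 * Build one counter block from x7 (high 64 bits) and x8 (low 64 bits),
 * post-increment the 128-bit counter in x7:x8, and convert the vector
 * to big-endian byte order.
 */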
#define inc_le128(vctr)                     \
	mov		vctr.d[1], x8;      \
	mov		vctr.d[0], x7;      \
	adds		x8, x8, #1;         \
	adc		x7, x7, xzr;        \
	rev64		vctr.16b, vctr.16b;

	/* construct CTRs */
	inc_le128(v0);			/* +0 */
	inc_le128(v1);			/* +1 */
	inc_le128(v2);			/* +2 */
	inc_le128(v3);			/* +3 */
	inc_le128(v4);			/* +4 */
	inc_le128(v5);			/* +5 */
	inc_le128(v6);			/* +6 */
	inc_le128(v7);			/* +7 */

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

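	/* XOR the encrypted counter blocks with src to produce dst */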
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v4.16b, v4.16b, RTMP0.16b;
	eor		v5.16b, v5.16b, RTMP1.16b;
	eor		v6.16b, v6.16b, RTMP2.16b;
	eor		v7.16b, v7.16b, RTMP3.16b;
	st1		{v4.16b-v7.16b}, [x1], #64;

	b		.Lctr_loop_blk;

.Lctr_end:
	/* store new CTR */
	rev		x7, x7;
	rev		x8, x8;
	stp		x7, x8, [x3];

	ret;
SYM_FUNC_END(sm4_neon_ctr_enc_blk8)