1/*
2 * SSE2 implementation of MORUS-1280
3 *
4 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
5 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 as published
9 * by the Free Software Foundation.
10 */
11
12#include <linux/linkage.h>
13#include <asm/frame.h>
14
/*
 * Build a pshufd immediate: destination dword k is taken from source
 * dword i<k>.  Every argument is parenthesized so that an expression can
 * be passed without changing the precedence of the shifts and ORs.
 */
#define SHUFFLE_MASK(i0, i1, i2, i3) \
	((i0) | ((i1) << 2) | ((i2) << 4) | ((i3) << 6))

/* dword order 2, 3, 0, 1: exchanges the two 64-bit halves of a register */
#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)

/*
 * Register allocation.  Each of the five state words is kept as a pair of
 * 128-bit registers (LO/HI).
 */
#define STATE0_LO	%xmm0
#define STATE0_HI	%xmm1
#define STATE1_LO	%xmm2
#define STATE1_HI	%xmm3
#define STATE2_LO	%xmm4
#define STATE2_HI	%xmm5
#define STATE3_LO	%xmm6
#define STATE3_HI	%xmm7
#define STATE4_LO	%xmm8
#define STATE4_HI	%xmm9
/* KEY_* and MSG_* are two names for the same registers (xmm10/xmm11) */
#define KEY_LO		%xmm10
#define KEY_HI		%xmm11
#define MSG_LO		%xmm10
#define MSG_HI		%xmm11
/* temporaries; T0_LO is also the scratch register of the rol*/round macros */
#define T0_LO		%xmm12
#define T0_HI		%xmm13
#define T1_LO		%xmm14
#define T1_HI		%xmm15
38
/*
 * 32-byte constant loaded into STATE4 by the init routine.  The bytes are
 * the Fibonacci sequence reduced modulo 256.
 */
.section .rodata.cst16.morus640_const, "aM", @progbits, 16
.balign 16
.Lmorus640_const_0:
	.byte 0x00, 0x01, 0x01, 0x02
	.byte 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59
	.byte 0x90, 0xe9, 0x79, 0x62
.Lmorus640_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55
	.byte 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42
	.byte 0x73, 0xb5, 0x28, 0xdd

/*
 * Byte indices 0..31.  The tail decryption routine compares them against
 * the byte count to build a mask that keeps only the valid bytes.
 */
.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
.balign 16
.Lmorus640_counter_0:
	.byte 0x00, 0x01, 0x02, 0x03
	.byte 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b
	.byte 0x0c, 0x0d, 0x0e, 0x0f
.Lmorus640_counter_1:
	.byte 0x10, 0x11, 0x12, 0x13
	.byte 0x14, 0x15, 0x16, 0x17
	.byte 0x18, 0x19, 0x1a, 0x1b
	.byte 0x1c, 0x1d, 0x1e, 0x1f
56
57.text
58
/*
 * rol1 hi, lo - rotate the 256-bit value hi:lo left by one 64-bit word.
 *
 * Notation "A | B" below means: high qword A, low qword B.
 *
 *   in:   hi = HI_1 | HI_0     lo = LO_1 | LO_0
 *   out:  hi = HI_0 | LO_1     lo = LO_0 | HI_1
 *
 * Clobbers T0_LO.
 */
.macro rol1 hi, lo
	pshufd $MASK2, \hi, \hi		/* hi = HI_0 | HI_1 */
	movdqa \hi, T0_LO		/* T0 = HI_0 | HI_1 */
	punpcklqdq \lo, T0_LO		/* T0 = LO_0 | HI_1 */
	punpckhqdq \hi, \lo		/* lo = HI_0 | LO_1 */
	movdqa \lo, \hi			/* hi = HI_0 | LO_1 */
	movdqa T0_LO, \lo		/* lo = LO_0 | HI_1 */
.endm
74
/*
 * rol2 hi, lo - rotate the 256-bit value hi:lo by 128 bits, i.e. exchange
 * the contents of the two registers.
 *
 * Clobbers T0_LO (left holding the original \lo).
 */
.macro rol2 hi, lo
	movdqa \lo, T0_LO		/* T0 = old lo */
	movdqa \hi, \lo			/* lo = old hi */
	movdqa T0_LO, \hi		/* hi = old lo */
.endm
80
/*
 * rol3 hi, lo - rotate the 256-bit value hi:lo left by three 64-bit words.
 *
 * Notation "A | B" below means: high qword A, low qword B.
 *
 *   in:   hi = HI_1 | HI_0     lo = LO_1 | LO_0
 *   out:  hi = LO_0 | HI_1     lo = HI_0 | LO_1
 *
 * Clobbers T0_LO.  \lo must not be T0_LO itself: the original \lo is read
 * again after T0_LO has been overwritten.
 */
.macro rol3 hi, lo
	pshufd $MASK2, \hi, \hi		/* hi = HI_0 | HI_1 */
	movdqa \lo, T0_LO		/* T0 = LO_1 | LO_0 */
	punpckhqdq \hi, T0_LO		/* T0 = HI_0 | LO_1 */
	punpcklqdq \lo, \hi		/* hi = LO_0 | HI_1 */
	movdqa T0_LO, \lo		/* lo = HI_0 | LO_1 */
.endm
95
/*
 * morus1280_round - one round of the state update.
 *
 *   s0 ^= (s1 & s2) ^ s3
 *   every 64-bit lane of s0 is rotated left by \b bits
 *   s3 is rotated as a whole by the word-rotation macro \w
 *
 * Each sN is given as a _l/_h register pair.  s4_l/s4_h are accepted for
 * symmetry of the call sites but are not referenced here.
 *
 * Clobbers T0_LO.
 */
.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w
	/* low half: mix, then rotate each qword left by \b */
	movdqa \s1_l, T0_LO
	pand \s2_l, T0_LO
	pxor T0_LO, \s0_l
	pxor \s3_l, \s0_l
	movdqa \s0_l, T0_LO
	psllq $\b, T0_LO
	psrlq $(64 - \b), \s0_l
	pxor T0_LO, \s0_l

	/* high half: same operations on the _h registers */
	movdqa \s1_h, T0_LO
	pand \s2_h, T0_LO
	pxor T0_LO, \s0_h
	pxor \s3_h, \s0_h
	movdqa \s0_h, T0_LO
	psllq $\b, T0_LO
	psrlq $(64 - \b), \s0_h
	pxor T0_LO, \s0_h

	/* word-wise rotation of s3 */
	\w \s3_h, \s3_l
.endm
120
/*
 * __morus1280_update: internal ABI
 *
 * Runs the five rounds of one state update.  The message block is XORed
 * into STATE1..STATE4 after rounds 1..4 respectively.
 *
 * input:
 *   STATE[0-4] - input state
 *   MSG        - message block
 * output:
 *   STATE[0-4] - output state
 * changed:
 *   T0 (T0_LO is the scratch register of the round macros)
 */
__morus1280_update:
	/* round 1: updates STATE0, rotates STATE3 */
	morus1280_round STATE0_LO, STATE0_HI, STATE1_LO, STATE1_HI, \
			STATE2_LO, STATE2_HI, STATE3_LO, STATE3_HI, \
			STATE4_LO, STATE4_HI, 13, rol1
	pxor MSG_LO, STATE1_LO
	pxor MSG_HI, STATE1_HI

	/* round 2: updates STATE1, rotates STATE4 */
	morus1280_round STATE1_LO, STATE1_HI, STATE2_LO, STATE2_HI, \
			STATE3_LO, STATE3_HI, STATE4_LO, STATE4_HI, \
			STATE0_LO, STATE0_HI, 46, rol2
	pxor MSG_LO, STATE2_LO
	pxor MSG_HI, STATE2_HI

	/* round 3: updates STATE2, rotates STATE0 */
	morus1280_round STATE2_LO, STATE2_HI, STATE3_LO, STATE3_HI, \
			STATE4_LO, STATE4_HI, STATE0_LO, STATE0_HI, \
			STATE1_LO, STATE1_HI, 38, rol3
	pxor MSG_LO, STATE3_LO
	pxor MSG_HI, STATE3_HI

	/* round 4: updates STATE3, rotates STATE1 */
	morus1280_round STATE3_LO, STATE3_HI, STATE4_LO, STATE4_HI, \
			STATE0_LO, STATE0_HI, STATE1_LO, STATE1_HI, \
			STATE2_LO, STATE2_HI, 7, rol2
	pxor MSG_LO, STATE4_LO
	pxor MSG_HI, STATE4_HI

	/* round 5: updates STATE4, rotates STATE2 */
	morus1280_round STATE4_LO, STATE4_HI, STATE0_LO, STATE0_HI, \
			STATE1_LO, STATE1_HI, STATE2_LO, STATE2_HI, \
			STATE3_LO, STATE3_HI, 4, rol1
	ret
ENDPROC(__morus1280_update)
177
/*
 * __morus1280_update_zero: internal ABI
 *
 * Same five rounds as __morus1280_update, but no message block is mixed
 * in between the rounds (equivalent to an all-zero message).  MSG/KEY
 * registers are not touched.
 *
 * input:
 *   STATE[0-4] - input state
 * output:
 *   STATE[0-4] - output state
 * changed:
 *   T0 (T0_LO is the scratch register of the round macros)
 */
__morus1280_update_zero:
	/* round 1: updates STATE0, rotates STATE3 */
	morus1280_round STATE0_LO, STATE0_HI, STATE1_LO, STATE1_HI, \
			STATE2_LO, STATE2_HI, STATE3_LO, STATE3_HI, \
			STATE4_LO, STATE4_HI, 13, rol1

	/* round 2: updates STATE1, rotates STATE4 */
	morus1280_round STATE1_LO, STATE1_HI, STATE2_LO, STATE2_HI, \
			STATE3_LO, STATE3_HI, STATE4_LO, STATE4_HI, \
			STATE0_LO, STATE0_HI, 46, rol2

	/* round 3: updates STATE2, rotates STATE0 */
	morus1280_round STATE2_LO, STATE2_HI, STATE3_LO, STATE3_HI, \
			STATE4_LO, STATE4_HI, STATE0_LO, STATE0_HI, \
			STATE1_LO, STATE1_HI, 38, rol3

	/* round 4: updates STATE3, rotates STATE1 */
	morus1280_round STATE3_LO, STATE3_HI, STATE4_LO, STATE4_HI, \
			STATE0_LO, STATE0_HI, STATE1_LO, STATE1_HI, \
			STATE2_LO, STATE2_HI, 7, rol2

	/* round 5: updates STATE4, rotates STATE2 */
	morus1280_round STATE4_LO, STATE4_HI, STATE0_LO, STATE0_HI, \
			STATE1_LO, STATE1_HI, STATE2_LO, STATE2_HI, \
			STATE3_LO, STATE3_HI, 4, rol1
	ret
ENDPROC(__morus1280_update_zero)
225
/*
 * __load_partial: internal ABI
 *
 * Loads the first (bytes & 0x1f) bytes at src into MSG and zero-fills the
 * rest.  The count is decomposed by its bits: a set bit k means a chunk of
 * 2^k bytes is present, located at the offset given by the higher set bits.
 * Chunks are gathered from the smallest upwards, each new chunk pushing
 * the earlier ones towards the high end.
 *
 * input:
 *   %rsi - src
 *   %rcx - bytes (only bits 0-4 are examined)
 * output:
 *   MSG  - message block
 * changed:
 *   %r8
 *   %r9
 *   T0_LO (only when bit 3 of the count is set)
 */
__load_partial:
	pxor MSG_LO, MSG_LO
	pxor MSG_HI, MSG_HI
	xor %r9d, %r9d

	/* bit 0: one byte at offset (bytes & 0x1e) */
	testb $0x1, %cl
	jz .Lld_word
	mov %ecx, %r8d
	and $0x1E, %r8d
	movb (%rsi, %r8), %r9b

.Lld_word:
	/* bit 1: two bytes at offset (bytes & 0x1c) */
	testb $0x2, %cl
	jz .Lld_dword
	mov %ecx, %r8d
	and $0x1C, %r8d
	shl $16, %r9
	movw (%rsi, %r8), %r9w

.Lld_dword:
	/* bit 2: four bytes at offset (bytes & 0x18) */
	testb $0x4, %cl
	jz .Lld_qword
	mov %ecx, %r8d
	and $0x18, %r8d
	shl $32, %r9
	movl (%rsi, %r8), %r8d
	xor %r8, %r9

.Lld_qword:
	/* up to 7 bytes gathered so far go into the low qword of MSG_LO */
	movq %r9, MSG_LO

	/* bit 3: eight bytes at offset (bytes & 0x10) */
	testb $0x8, %cl
	jz .Lld_block16
	mov %ecx, %r8d
	and $0x10, %r8d
	pslldq $8, MSG_LO
	movq (%rsi, %r8), T0_LO
	pxor T0_LO, MSG_LO

.Lld_block16:
	/* bit 4: a full 16-byte block at offset 0; the rest moves to MSG_HI */
	testb $0x10, %cl
	jz .Lld_done
	movdqa MSG_LO, MSG_HI
	movdqu (%rsi), MSG_LO

.Lld_done:
	ret
ENDPROC(__load_partial)
299
/*
 * __store_partial: internal ABI
 *
 * Writes the first `bytes` bytes of the block held in T0 to dst, using
 * progressively smaller stores (16, 8, 4, 2, 1).  At most 31 bytes are
 * written.
 *
 * input:
 *   %rdx - dst
 *   %rcx - bytes (the low 32 bits are used)
 *   T0   - message block to store
 * changed:
 *   %r8
 *   %r9
 *   %r10
 *   T0_LO (consumed while storing)
 */
__store_partial:
	/* the count is an unsigned 32-bit value: zero-extend it explicitly */
	mov %ecx, %r8d
	mov %rdx, %r9

	/* byte counts are unsigned, so use unsigned branches throughout */
	cmp $16, %r8
	jb .Lst_partial_16

	movdqu T0_LO, (%r9)
	movdqa T0_HI, T0_LO	/* continue with the upper 16 bytes */

	sub $16, %r8
	add $16, %r9

.Lst_partial_16:
	movq T0_LO, %r10

	cmp $8, %r8
	jb .Lst_partial_8

	mov %r10, (%r9)
	psrldq $8, T0_LO	/* bring the next 8 bytes down */
	movq T0_LO, %r10

	sub $8, %r8
	add $8, %r9

.Lst_partial_8:
	cmp $4, %r8
	jb .Lst_partial_4

	mov %r10d, (%r9)
	shr $32, %r10

	sub $4, %r8
	add $4, %r9

.Lst_partial_4:
	cmp $2, %r8
	jb .Lst_partial_2

	mov %r10w, (%r9)
	shr $16, %r10

	sub $2, %r8
	add $2, %r9

.Lst_partial_2:
	cmp $1, %r8
	jb .Lst_partial_1

	mov %r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)
367
/*
 * void crypto_morus1280_sse2_init(void *state, const void *key,
 *                                 const void *iv);
 *
 * Builds the initial state from a 32-byte key (%rsi) and a 16-byte IV
 * (%rdx), and writes the resulting 160 bytes of state to %rdi:
 *
 *   STATE0 = IV in the low half, zero in the high half
 *   STATE1 = key
 *   STATE2 = all ones
 *   STATE3 = all zeros
 *   STATE4 = constant from .rodata
 *
 * The state is then updated 16 times without message input, and the key
 * is XORed into STATE1 once more.
 */
ENTRY(crypto_morus1280_sse2_init)
	FRAME_BEGIN

	/* load IV: */
	pxor STATE0_HI, STATE0_HI
	movdqu (%rdx), STATE0_LO
	/* load key: */
	movdqu  0(%rsi), KEY_LO
	movdqu 16(%rsi), KEY_HI
	movdqa KEY_LO, STATE1_LO
	movdqa KEY_HI, STATE1_HI
	/* load all ones: */
	pcmpeqd STATE2_LO, STATE2_LO
	pcmpeqd STATE2_HI, STATE2_HI
	/* load all zeros: */
	pxor STATE3_LO, STATE3_LO
	pxor STATE3_HI, STATE3_HI
	/* load the constant (RIP-relative: position independent, shorter): */
	movdqa .Lmorus640_const_0(%rip), STATE4_LO
	movdqa .Lmorus640_const_1(%rip), STATE4_HI

	/*
	 * update 16 times with zero; KEY_LO/KEY_HI survive because the
	 * zero-update only uses the STATE registers and T0:
	 */
	.rept 16
	call __morus1280_update_zero
	.endr

	/* xor-in the key again after updates: */
	pxor KEY_LO, STATE1_LO
	pxor KEY_HI, STATE1_HI

	/* store the state: */
	movdqu STATE0_LO, (0 * 16)(%rdi)
	movdqu STATE0_HI, (1 * 16)(%rdi)
	movdqu STATE1_LO, (2 * 16)(%rdi)
	movdqu STATE1_HI, (3 * 16)(%rdi)
	movdqu STATE2_LO, (4 * 16)(%rdi)
	movdqu STATE2_HI, (5 * 16)(%rdi)
	movdqu STATE3_LO, (6 * 16)(%rdi)
	movdqu STATE3_HI, (7 * 16)(%rdi)
	movdqu STATE4_LO, (8 * 16)(%rdi)
	movdqu STATE4_HI, (9 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_init)
430
/*
 * void crypto_morus1280_sse2_ad(void *state, const void *data,
 *                               unsigned int length);
 *
 * Absorbs associated data into the state, one 32-byte block per update.
 * Only whole blocks are consumed; a trailing remainder shorter than 32
 * bytes is ignored here.  If length < 32 the state is left untouched.
 *
 *   %rdi - state (160 bytes, loaded and stored with unaligned moves)
 *   %rsi - data; a 16-byte aligned pointer selects the movdqa loop
 *   %edx - length; it is a 32-bit unsigned argument, so only %edx is
 *          examined and every comparison on it is unsigned
 */
ENTRY(crypto_morus1280_sse2_ad)
	FRAME_BEGIN

	cmp $32, %edx
	jb .Lad_out

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0_LO
	movdqu (1 * 16)(%rdi), STATE0_HI
	movdqu (2 * 16)(%rdi), STATE1_LO
	movdqu (3 * 16)(%rdi), STATE1_HI
	movdqu (4 * 16)(%rdi), STATE2_LO
	movdqu (5 * 16)(%rdi), STATE2_HI
	movdqu (6 * 16)(%rdi), STATE3_LO
	movdqu (7 * 16)(%rdi), STATE3_HI
	movdqu (8 * 16)(%rdi), STATE4_LO
	movdqu (9 * 16)(%rdi), STATE4_HI

	/* pick the aligned or the unaligned loop: */
	mov %rsi, %r8
	and $0xF, %r8
	jnz .Lad_u_loop

.align 4
.Lad_a_loop:
	movdqa  0(%rsi), MSG_LO
	movdqa 16(%rsi), MSG_HI
	call __morus1280_update
	sub $32, %edx
	add $32, %rsi
	cmp $32, %edx
	jae .Lad_a_loop

	jmp .Lad_cont
.align 4
.Lad_u_loop:
	movdqu  0(%rsi), MSG_LO
	movdqu 16(%rsi), MSG_HI
	call __morus1280_update
	sub $32, %edx
	add $32, %rsi
	cmp $32, %edx
	jae .Lad_u_loop

.Lad_cont:
	/* store the state: */
	movdqu STATE0_LO, (0 * 16)(%rdi)
	movdqu STATE0_HI, (1 * 16)(%rdi)
	movdqu STATE1_LO, (2 * 16)(%rdi)
	movdqu STATE1_HI, (3 * 16)(%rdi)
	movdqu STATE2_LO, (4 * 16)(%rdi)
	movdqu STATE2_HI, (5 * 16)(%rdi)
	movdqu STATE3_LO, (6 * 16)(%rdi)
	movdqu STATE3_HI, (7 * 16)(%rdi)
	movdqu STATE4_LO, (8 * 16)(%rdi)
	movdqu STATE4_HI, (9 * 16)(%rdi)

.Lad_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_ad)
495
/*
 * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst,
 *                                unsigned int length);
 *
 * Encrypts whole 32-byte blocks.  For every block:
 *
 *   dst = src ^ STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3)
 *
 * and the state is then updated with the plaintext block.  A trailing
 * remainder shorter than 32 bytes is not processed here; if length < 32
 * nothing is read or written.
 *
 *   %rdi - state (160 bytes)
 *   %rsi - src
 *   %rdx - dst; the movdqa loop is used only if src and dst are both
 *          16-byte aligned
 *   %ecx - length; it is a 32-bit unsigned argument, so only %ecx is
 *          examined and every comparison on it is unsigned
 */
ENTRY(crypto_morus1280_sse2_enc)
	FRAME_BEGIN

	cmp $32, %ecx
	jb .Lenc_out

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0_LO
	movdqu (1 * 16)(%rdi), STATE0_HI
	movdqu (2 * 16)(%rdi), STATE1_LO
	movdqu (3 * 16)(%rdi), STATE1_HI
	movdqu (4 * 16)(%rdi), STATE2_LO
	movdqu (5 * 16)(%rdi), STATE2_HI
	movdqu (6 * 16)(%rdi), STATE3_LO
	movdqu (7 * 16)(%rdi), STATE3_HI
	movdqu (8 * 16)(%rdi), STATE4_LO
	movdqu (9 * 16)(%rdi), STATE4_HI

	/* aligned loop only if both pointers are 16-byte aligned: */
	mov %rsi, %r8
	or  %rdx, %r8
	and $0xF, %r8
	jnz .Lenc_u_loop

.align 4
.Lenc_a_loop:
	movdqa  0(%rsi), MSG_LO
	movdqa 16(%rsi), MSG_HI
	/* T1 = rol3(STATE1); rol3 clobbers T0_LO, so T0 is set up after it */
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO
	/* T0 = MSG ^ rol3(STATE1) ^ STATE0 ^ (STATE2 & STATE3) */
	movdqa MSG_LO, T0_LO
	movdqa MSG_HI, T0_HI
	pxor T1_LO, T0_LO
	pxor T1_HI, T0_HI
	pxor STATE0_LO, T0_LO
	pxor STATE0_HI, T0_HI
	movdqa STATE2_LO, T1_LO
	movdqa STATE2_HI, T1_HI
	pand STATE3_LO, T1_LO
	pand STATE3_HI, T1_HI
	pxor T1_LO, T0_LO
	pxor T1_HI, T0_HI
	movdqa T0_LO,  0(%rdx)
	movdqa T0_HI, 16(%rdx)

	/* MSG still holds the plaintext block */
	call __morus1280_update
	sub $32, %ecx
	add $32, %rsi
	add $32, %rdx
	cmp $32, %ecx
	jae .Lenc_a_loop

	jmp .Lenc_cont
.align 4
.Lenc_u_loop:
	movdqu  0(%rsi), MSG_LO
	movdqu 16(%rsi), MSG_HI
	/* T1 = rol3(STATE1); rol3 clobbers T0_LO, so T0 is set up after it */
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO
	/* T0 = MSG ^ rol3(STATE1) ^ STATE0 ^ (STATE2 & STATE3) */
	movdqa MSG_LO, T0_LO
	movdqa MSG_HI, T0_HI
	pxor T1_LO, T0_LO
	pxor T1_HI, T0_HI
	pxor STATE0_LO, T0_LO
	pxor STATE0_HI, T0_HI
	movdqa STATE2_LO, T1_LO
	movdqa STATE2_HI, T1_HI
	pand STATE3_LO, T1_LO
	pand STATE3_HI, T1_HI
	pxor T1_LO, T0_LO
	pxor T1_HI, T0_HI
	movdqu T0_LO,  0(%rdx)
	movdqu T0_HI, 16(%rdx)

	/* MSG still holds the plaintext block */
	call __morus1280_update
	sub $32, %ecx
	add $32, %rsi
	add $32, %rdx
	cmp $32, %ecx
	jae .Lenc_u_loop

.Lenc_cont:
	/* store the state: */
	movdqu STATE0_LO, (0 * 16)(%rdi)
	movdqu STATE0_HI, (1 * 16)(%rdi)
	movdqu STATE1_LO, (2 * 16)(%rdi)
	movdqu STATE1_HI, (3 * 16)(%rdi)
	movdqu STATE2_LO, (4 * 16)(%rdi)
	movdqu STATE2_HI, (5 * 16)(%rdi)
	movdqu STATE3_LO, (6 * 16)(%rdi)
	movdqu STATE3_HI, (7 * 16)(%rdi)
	movdqu STATE4_LO, (8 * 16)(%rdi)
	movdqu STATE4_HI, (9 * 16)(%rdi)

.Lenc_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_enc)
599
/*
 * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst,
 *                                     unsigned int length);
 *
 * Encrypts one final partial block.  The input is read through
 * __load_partial (zero-padded to 32 bytes), the ciphertext
 *
 *   src ^ STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3)
 *
 * is written through __store_partial (only `length` bytes), and the state
 * is updated with the zero-padded plaintext block.
 *
 *   %rdi - state (160 bytes)
 *   %rsi - src
 *   %rdx - dst
 *   %rcx - length
 */
ENTRY(crypto_morus1280_sse2_enc_tail)
	FRAME_BEGIN

	/* bring the state into registers: */
	movdqu (0 * 16)(%rdi), STATE0_LO
	movdqu (1 * 16)(%rdi), STATE0_HI
	movdqu (2 * 16)(%rdi), STATE1_LO
	movdqu (3 * 16)(%rdi), STATE1_HI
	movdqu (4 * 16)(%rdi), STATE2_LO
	movdqu (5 * 16)(%rdi), STATE2_HI
	movdqu (6 * 16)(%rdi), STATE3_LO
	movdqu (7 * 16)(%rdi), STATE3_HI
	movdqu (8 * 16)(%rdi), STATE4_LO
	movdqu (9 * 16)(%rdi), STATE4_HI

	/* MSG = zero-padded plaintext: */
	call __load_partial

	/* T1 = rol3(STATE1); this clobbers T0_LO, so T0 is filled afterwards */
	movdqa STATE1_HI, T1_HI
	movdqa STATE1_LO, T1_LO
	rol3 T1_HI, T1_LO

	/* low half of the ciphertext block */
	movdqa MSG_LO, T0_LO
	pxor STATE0_LO, T0_LO
	pxor T1_LO, T0_LO
	movdqa STATE3_LO, T1_LO
	pand STATE2_LO, T1_LO
	pxor T1_LO, T0_LO

	/* high half of the ciphertext block */
	movdqa MSG_HI, T0_HI
	pxor STATE0_HI, T0_HI
	pxor T1_HI, T0_HI
	movdqa STATE3_HI, T1_HI
	pand STATE2_HI, T1_HI
	pxor T1_HI, T0_HI

	/* write out `length` bytes of T0: */
	call __store_partial

	/* absorb the padded plaintext, which is still in MSG: */
	call __morus1280_update

	/* write the state back: */
	movdqu STATE0_LO, (0 * 16)(%rdi)
	movdqu STATE0_HI, (1 * 16)(%rdi)
	movdqu STATE1_LO, (2 * 16)(%rdi)
	movdqu STATE1_HI, (3 * 16)(%rdi)
	movdqu STATE2_LO, (4 * 16)(%rdi)
	movdqu STATE2_HI, (5 * 16)(%rdi)
	movdqu STATE3_LO, (6 * 16)(%rdi)
	movdqu STATE3_HI, (7 * 16)(%rdi)
	movdqu STATE4_LO, (8 * 16)(%rdi)
	movdqu STATE4_HI, (9 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_enc_tail)
657
/*
 * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst,
 *                                unsigned int length);
 *
 * Decrypts whole 32-byte blocks.  For every block:
 *
 *   dst = src ^ STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3)
 *
 * and the state is then updated with the recovered plaintext block.  A
 * trailing remainder shorter than 32 bytes is not processed here; if
 * length < 32 nothing is read or written.
 *
 *   %rdi - state (160 bytes)
 *   %rsi - src
 *   %rdx - dst; the movdqa loop is used only if src and dst are both
 *          16-byte aligned
 *   %ecx - length; it is a 32-bit unsigned argument, so only %ecx is
 *          examined and every comparison on it is unsigned
 */
ENTRY(crypto_morus1280_sse2_dec)
	FRAME_BEGIN

	cmp $32, %ecx
	jb .Ldec_out

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0_LO
	movdqu (1 * 16)(%rdi), STATE0_HI
	movdqu (2 * 16)(%rdi), STATE1_LO
	movdqu (3 * 16)(%rdi), STATE1_HI
	movdqu (4 * 16)(%rdi), STATE2_LO
	movdqu (5 * 16)(%rdi), STATE2_HI
	movdqu (6 * 16)(%rdi), STATE3_LO
	movdqu (7 * 16)(%rdi), STATE3_HI
	movdqu (8 * 16)(%rdi), STATE4_LO
	movdqu (9 * 16)(%rdi), STATE4_HI

	/* aligned loop only if both pointers are 16-byte aligned: */
	mov %rsi, %r8
	or  %rdx, %r8
	and $0xF, %r8
	jnz .Ldec_u_loop

.align 4
.Ldec_a_loop:
	movdqa  0(%rsi), MSG_LO
	movdqa 16(%rsi), MSG_HI
	/* MSG ^= STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3) */
	pxor STATE0_LO, MSG_LO
	pxor STATE0_HI, MSG_HI
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	movdqa STATE2_LO, T1_LO
	movdqa STATE2_HI, T1_HI
	pand STATE3_LO, T1_LO
	pand STATE3_HI, T1_HI
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	movdqa MSG_LO,  0(%rdx)
	movdqa MSG_HI, 16(%rdx)

	/* MSG now holds the plaintext block */
	call __morus1280_update
	sub $32, %ecx
	add $32, %rsi
	add $32, %rdx
	cmp $32, %ecx
	jae .Ldec_a_loop

	jmp .Ldec_cont
.align 4
.Ldec_u_loop:
	movdqu  0(%rsi), MSG_LO
	movdqu 16(%rsi), MSG_HI
	/* MSG ^= STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3) */
	pxor STATE0_LO, MSG_LO
	pxor STATE0_HI, MSG_HI
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	movdqa STATE2_LO, T1_LO
	movdqa STATE2_HI, T1_HI
	pand STATE3_LO, T1_LO
	pand STATE3_HI, T1_HI
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	movdqu MSG_LO,  0(%rdx)
	movdqu MSG_HI, 16(%rdx)

	/* MSG now holds the plaintext block */
	call __morus1280_update
	sub $32, %ecx
	add $32, %rsi
	add $32, %rdx
	cmp $32, %ecx
	jae .Ldec_u_loop

.Ldec_cont:
	/* store the state: */
	movdqu STATE0_LO, (0 * 16)(%rdi)
	movdqu STATE0_HI, (1 * 16)(%rdi)
	movdqu STATE1_LO, (2 * 16)(%rdi)
	movdqu STATE1_HI, (3 * 16)(%rdi)
	movdqu STATE2_LO, (4 * 16)(%rdi)
	movdqu STATE2_HI, (5 * 16)(%rdi)
	movdqu STATE3_LO, (6 * 16)(%rdi)
	movdqu STATE3_HI, (7 * 16)(%rdi)
	movdqu STATE4_LO, (8 * 16)(%rdi)
	movdqu STATE4_HI, (9 * 16)(%rdi)

.Ldec_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_dec)
757
/*
 * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst,
 *                                     unsigned int length);
 *
 * Decrypts one final partial block.  The input is read through
 * __load_partial (zero-padded to 32 bytes) and XORed with
 *
 *   STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3)
 *
 * `length` bytes of the result are written through __store_partial.  The
 * bytes at positions >= length are then cleared in MSG, so the state is
 * updated with the zero-padded plaintext.
 *
 *   %rdi - state (160 bytes)
 *   %rsi - src
 *   %rdx - dst
 *   %rcx - length (only its low byte is used to build the mask)
 */
ENTRY(crypto_morus1280_sse2_dec_tail)
	FRAME_BEGIN

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0_LO
	movdqu (1 * 16)(%rdi), STATE0_HI
	movdqu (2 * 16)(%rdi), STATE1_LO
	movdqu (3 * 16)(%rdi), STATE1_HI
	movdqu (4 * 16)(%rdi), STATE2_LO
	movdqu (5 * 16)(%rdi), STATE2_HI
	movdqu (6 * 16)(%rdi), STATE3_LO
	movdqu (7 * 16)(%rdi), STATE3_HI
	movdqu (8 * 16)(%rdi), STATE4_LO
	movdqu (9 * 16)(%rdi), STATE4_HI

	/* decrypt message: */
	call __load_partial

	pxor STATE0_LO, MSG_LO
	pxor STATE0_HI, MSG_HI
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	movdqa STATE2_LO, T1_LO
	movdqa STATE2_HI, T1_HI
	pand STATE3_LO, T1_LO
	pand STATE3_HI, T1_HI
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	/* __store_partial takes its data in T0 and consumes it */
	movdqa MSG_LO, T0_LO
	movdqa MSG_HI, T0_HI

	call __store_partial

	/*
	 * mask with byte count: replicate the low byte of the count into all
	 * 16 lanes, then a signed byte compare "count > index" yields 0xff
	 * exactly for the valid byte positions.
	 */
	movq %rcx, T0_LO
	punpcklbw T0_LO, T0_LO
	punpcklbw T0_LO, T0_LO
	punpcklbw T0_LO, T0_LO
	punpcklbw T0_LO, T0_LO
	movdqa T0_LO, T0_HI
	/* RIP-relative loads: position independent and shorter to encode */
	movdqa .Lmorus640_counter_0(%rip), T1_LO
	movdqa .Lmorus640_counter_1(%rip), T1_HI
	pcmpgtb T1_LO, T0_LO
	pcmpgtb T1_HI, T0_HI
	pand T0_LO, MSG_LO
	pand T0_HI, MSG_HI

	call __morus1280_update

	/* store the state: */
	movdqu STATE0_LO, (0 * 16)(%rdi)
	movdqu STATE0_HI, (1 * 16)(%rdi)
	movdqu STATE1_LO, (2 * 16)(%rdi)
	movdqu STATE1_HI, (3 * 16)(%rdi)
	movdqu STATE2_LO, (4 * 16)(%rdi)
	movdqu STATE2_HI, (5 * 16)(%rdi)
	movdqu STATE3_LO, (6 * 16)(%rdi)
	movdqu STATE3_HI, (7 * 16)(%rdi)
	movdqu STATE4_LO, (8 * 16)(%rdi)
	movdqu STATE4_HI, (9 * 16)(%rdi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_dec_tail)
829
/*
 * void crypto_morus1280_sse2_final(void *state, void *tag_xor,
 *                                  u64 assoclen, u64 cryptlen);
 *
 * Finalization: STATE4 ^= STATE0, then ten updates with the length block
 * (assoclen * 8 in the lowest qword, cryptlen * 8 in the next qword, upper
 * 128 bits zero).  Finally the 32 bytes at tag_xor are XORed in place with
 *
 *   STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3)
 *
 * The state in memory is only read, never written back.
 *
 *   %rdi - state (160 bytes)
 *   %rsi - tag_xor (32 bytes, read and written)
 *   %rdx - assoclen
 *   %rcx - cryptlen
 */
ENTRY(crypto_morus1280_sse2_final)
	FRAME_BEGIN

	/* load the state: */
	movdqu (0 * 16)(%rdi), STATE0_LO
	movdqu (1 * 16)(%rdi), STATE0_HI
	movdqu (2 * 16)(%rdi), STATE1_LO
	movdqu (3 * 16)(%rdi), STATE1_HI
	movdqu (4 * 16)(%rdi), STATE2_LO
	movdqu (5 * 16)(%rdi), STATE2_HI
	movdqu (6 * 16)(%rdi), STATE3_LO
	movdqu (7 * 16)(%rdi), STATE3_HI
	movdqu (8 * 16)(%rdi), STATE4_LO
	movdqu (9 * 16)(%rdi), STATE4_HI

	/* xor state[0] into state[4]: */
	pxor STATE0_LO, STATE4_LO
	pxor STATE0_HI, STATE4_HI

	/* prepare length block: */
	movq %rdx, MSG_LO
	movq %rcx, T0_LO
	pslldq $8, T0_LO
	pxor T0_LO, MSG_LO
	psllq $3, MSG_LO /* multiply by 8 (to get bit count) */
	pxor MSG_HI, MSG_HI

	/* update state ten times with the length block: */
	.rept 10
	call __morus1280_update
	.endr

	/* xor tag: */
	movdqu  0(%rsi), MSG_LO
	movdqu 16(%rsi), MSG_HI

	pxor STATE0_LO, MSG_LO
	pxor STATE0_HI, MSG_HI
	/*
	 * rol3 uses T0_LO as its scratch register, so its operands must not
	 * be T0: with \lo == T0_LO the high result register picks up the
	 * wrong qword.  Use T1, as the enc/dec paths do.
	 */
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	movdqa STATE2_LO, T1_LO
	movdqa STATE2_HI, T1_HI
	pand STATE3_LO, T1_LO
	pand STATE3_HI, T1_HI
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI

	movdqu MSG_LO,  0(%rsi)
	movdqu MSG_HI, 16(%rsi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_final)
897