/*
 * SSE2 implementation of MORUS-1280
 *
 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

/*
 * Syntax: GNU as, AT&T operand order (source, destination), run through
 * the C preprocessor.  Exported entry points take their arguments in
 * %rdi, %rsi, %rdx, %rcx (in that order).
 *
 * The cipher state is five 256-bit words.  Each word is kept in a pair of
 * XMM registers (*_LO = bytes 0..15, *_HI = bytes 16..31) and is stored in
 * memory as ten consecutive 16-byte values starting at the state pointer.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

#define SHUFFLE_MASK(i0, i1, i2, i3) \
	(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))

/* pshufd selector that swaps the two 64-bit halves of an XMM register */
#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)

#define STATE0_LO	%xmm0
#define STATE0_HI	%xmm1
#define STATE1_LO	%xmm2
#define STATE1_HI	%xmm3
#define STATE2_LO	%xmm4
#define STATE2_HI	%xmm5
#define STATE3_LO	%xmm6
#define STATE3_HI	%xmm7
#define STATE4_LO	%xmm8
#define STATE4_HI	%xmm9
/* KEY_* and MSG_* deliberately share registers; they are never live together */
#define KEY_LO		%xmm10
#define KEY_HI		%xmm11
#define MSG_LO		%xmm10
#define MSG_HI		%xmm11
#define T0_LO		%xmm12
#define T0_HI		%xmm13
#define T1_LO		%xmm14
#define T1_HI		%xmm15

.section .rodata.cst16.morus1280_const, "aM", @progbits, 16
.align 16
/* initial value of STATE4 (low and high 16 bytes) */
.Lmorus1280_const_0:
	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.Lmorus1280_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd

.section .rodata.cst16.morus1280_counter, "aM", @progbits, 16
.align 16
/* byte indices 0..31, compared against the byte count to build a mask */
.Lmorus1280_counter_0:
	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
.Lmorus1280_counter_1:
	.byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f

.text

/*
 * rol1/rol2/rol3: permute the four 64-bit lanes of the register pair
 * (\hi, \lo) as shown in the diagrams below.
 *
 * All three use T0_LO as scratch, so neither \hi nor \lo may be T0_LO.
 */
.macro rol1 hi, lo
	/*
	 * HI_1 | HI_0 || LO_1 | LO_0
	 *  ==>
	 * HI_0 | HI_1 || LO_1 | LO_0
	 *  ==>
	 * HI_0 | LO_1 || LO_0 | HI_1
	 */
	pshufd $MASK2, \hi, \hi
	movdqa \hi, T0_LO
	punpcklqdq \lo, T0_LO
	punpckhqdq \hi, \lo
	movdqa \lo, \hi
	movdqa T0_LO, \lo
.endm

.macro rol2 hi, lo
	/* swap the two halves of the pair */
	movdqa \lo, T0_LO
	movdqa \hi, \lo
	movdqa T0_LO, \hi
.endm

.macro rol3 hi, lo
	/*
	 * HI_1 | HI_0 || LO_1 | LO_0
	 *  ==>
	 * HI_0 | HI_1 || LO_1 | LO_0
	 *  ==>
	 * LO_0 | HI_1 || HI_0 | LO_1
	 */
	pshufd $MASK2, \hi, \hi
	movdqa \lo, T0_LO
	punpckhqdq \hi, T0_LO
	/* reads the original \lo: this is why \lo must not alias T0_LO */
	punpcklqdq \lo, \hi
	movdqa T0_LO, \lo
.endm

/*
 * One round:
 *   s0 = rotate each 64-bit lane of (s0 ^ (s1 & s2) ^ s3) left by \b bits
 *   s3 = \w(s3)            (\w is one of rol1/rol2/rol3)
 *
 * s4_l/s4_h are accepted for symmetry of the call sites but are not
 * referenced by the macro body.
 * changed: T0_LO
 */
.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w
	movdqa \s1_l, T0_LO
	pand \s2_l, T0_LO
	pxor T0_LO, \s0_l

	movdqa \s1_h, T0_LO
	pand \s2_h, T0_LO
	pxor T0_LO, \s0_h

	pxor \s3_l, \s0_l
	pxor \s3_h, \s0_h

	/* 64-bit lane rotate built from two shifts */
	movdqa \s0_l, T0_LO
	psllq $\b, T0_LO
	psrlq $(64 - \b), \s0_l
	pxor T0_LO, \s0_l

	movdqa \s0_h, T0_LO
	psllq $\b, T0_LO
	psrlq $(64 - \b), \s0_h
	pxor T0_LO, \s0_h

	\w \s3_h, \s3_l
.endm

/* Load STATE[0-4] from the ten 16-byte slots at (%rdi). */
.macro morus1280_load_state
	movdqu (0 * 16)(%rdi), STATE0_LO
	movdqu (1 * 16)(%rdi), STATE0_HI
	movdqu (2 * 16)(%rdi), STATE1_LO
	movdqu (3 * 16)(%rdi), STATE1_HI
	movdqu (4 * 16)(%rdi), STATE2_LO
	movdqu (5 * 16)(%rdi), STATE2_HI
	movdqu (6 * 16)(%rdi), STATE3_LO
	movdqu (7 * 16)(%rdi), STATE3_HI
	movdqu (8 * 16)(%rdi), STATE4_LO
	movdqu (9 * 16)(%rdi), STATE4_HI
.endm

/* Store STATE[0-4] to the ten 16-byte slots at (%rdi). */
.macro morus1280_store_state
	movdqu STATE0_LO, (0 * 16)(%rdi)
	movdqu STATE0_HI, (1 * 16)(%rdi)
	movdqu STATE1_LO, (2 * 16)(%rdi)
	movdqu STATE1_HI, (3 * 16)(%rdi)
	movdqu STATE2_LO, (4 * 16)(%rdi)
	movdqu STATE2_HI, (5 * 16)(%rdi)
	movdqu STATE3_LO, (6 * 16)(%rdi)
	movdqu STATE3_HI, (7 * 16)(%rdi)
	movdqu STATE4_LO, (8 * 16)(%rdi)
	movdqu STATE4_HI, (9 * 16)(%rdi)
.endm

/*
 * T0 = MSG ^ STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3)
 * MSG and STATE[0-4] are preserved.
 * changed: T0, T1
 */
.macro morus1280_encrypt_block
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	/* rol3 scratches T0_LO, so it must run before T0 is filled in */
	rol3 T1_HI, T1_LO
	movdqa MSG_LO, T0_LO
	movdqa MSG_HI, T0_HI
	pxor T1_LO, T0_LO
	pxor T1_HI, T0_HI
	pxor STATE0_LO, T0_LO
	pxor STATE0_HI, T0_HI
	movdqa STATE2_LO, T1_LO
	movdqa STATE2_HI, T1_HI
	pand STATE3_LO, T1_LO
	pand STATE3_HI, T1_HI
	pxor T1_LO, T0_LO
	pxor T1_HI, T0_HI
.endm

/*
 * MSG ^= STATE0 ^ rol3(STATE1) ^ (STATE2 & STATE3)
 * STATE[0-4] are preserved.
 * changed: MSG, T0_LO (rol3 scratch), T1
 *
 * The rotated copy of STATE1 is built in T1, never in T0: rol3 uses T0_LO
 * as scratch and produces a wrong high half if its \lo operand is T0_LO.
 */
.macro morus1280_xor_keystream
	pxor STATE0_LO, MSG_LO
	pxor STATE0_HI, MSG_HI
	movdqa STATE1_LO, T1_LO
	movdqa STATE1_HI, T1_HI
	rol3 T1_HI, T1_LO
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
	movdqa STATE2_LO, T1_LO
	movdqa STATE2_HI, T1_HI
	pand STATE3_LO, T1_LO
	pand STATE3_HI, T1_HI
	pxor T1_LO, MSG_LO
	pxor T1_HI, MSG_HI
.endm

/*
 * __morus1280_update: internal ABI
 * input:
 *   STATE[0-4] - input state
 *   MSG        - message block
 * output:
 *   STATE[0-4] - output state
 * changed:
 *   T0
 *
 * Five rounds; MSG is XORed into STATE1..STATE4 between the rounds.
 */
__morus1280_update:
	morus1280_round \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		13, rol1
	pxor MSG_LO, STATE1_LO
	pxor MSG_HI, STATE1_HI
	morus1280_round \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		46, rol2
	pxor MSG_LO, STATE2_LO
	pxor MSG_HI, STATE2_HI
	morus1280_round \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		38, rol3
	pxor MSG_LO, STATE3_LO
	pxor MSG_HI, STATE3_HI
	morus1280_round \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		7, rol2
	pxor MSG_LO, STATE4_LO
	pxor MSG_HI, STATE4_HI
	morus1280_round \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		4, rol1
	ret
ENDPROC(__morus1280_update)

/*
 * __morus1280_update_zero: internal ABI
 * input:
 *   STATE[0-4] - input state
 * output:
 *   STATE[0-4] - output state
 * changed:
 *   T0
 *
 * Same five rounds as __morus1280_update but without the message XORs;
 * %xmm10/%xmm11 (KEY/MSG) are not touched.
 */
__morus1280_update_zero:
	morus1280_round \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		13, rol1
	morus1280_round \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		46, rol2
	morus1280_round \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		38, rol3
	morus1280_round \
		STATE3_LO, STATE3_HI, \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		7, rol2
	morus1280_round \
		STATE4_LO, STATE4_HI, \
		STATE0_LO, STATE0_HI, \
		STATE1_LO, STATE1_HI, \
		STATE2_LO, STATE2_HI, \
		STATE3_LO, STATE3_HI, \
		4, rol1
	ret
ENDPROC(__morus1280_update_zero)

/*
 * __load_partial: internal ABI
 * input:
 *   %rsi - src
 *   %rcx - bytes (only bits 0..4 are examined)
 * output:
 *   MSG  - message block, bytes past the count are zero
 * changed:
 *   %r8
 *   %r9
 *   T0_LO
 *
 * The tail is assembled from the smallest piece to the largest: each set
 * bit k of the count selects a 2^k-byte piece located at the offset given
 * by the higher bits of the count.
 */
__load_partial:
	xor %r9d, %r9d
	pxor MSG_LO, MSG_LO
	pxor MSG_HI, MSG_HI

	/* 1-byte piece at offset (bytes & 0x1E) */
	mov %rcx, %r8
	and $0x1, %r8
	jz .Lld_partial_1

	mov %rcx, %r8
	and $0x1E, %r8
	add %rsi, %r8
	mov (%r8), %r9b

.Lld_partial_1:
	/* 2-byte piece at offset (bytes & 0x1C) */
	mov %rcx, %r8
	and $0x2, %r8
	jz .Lld_partial_2

	mov %rcx, %r8
	and $0x1C, %r8
	add %rsi, %r8
	shl $16, %r9
	mov (%r8), %r9w

.Lld_partial_2:
	/* 4-byte piece at offset (bytes & 0x18) */
	mov %rcx, %r8
	and $0x4, %r8
	jz .Lld_partial_4

	mov %rcx, %r8
	and $0x18, %r8
	add %rsi, %r8
	shl $32, %r9
	mov (%r8), %r8d
	xor %r8, %r9

.Lld_partial_4:
	movq %r9, MSG_LO

	/* 8-byte piece at offset (bytes & 0x10) */
	mov %rcx, %r8
	and $0x8, %r8
	jz .Lld_partial_8

	mov %rcx, %r8
	and $0x10, %r8
	add %rsi, %r8
	pslldq $8, MSG_LO
	movq (%r8), T0_LO
	pxor T0_LO, MSG_LO

.Lld_partial_8:
	/* 16-byte piece at offset 0; what was gathered so far becomes MSG_HI */
	mov %rcx, %r8
	and $0x10, %r8
	jz .Lld_partial_16

	movdqa MSG_LO, MSG_HI
	movdqu (%rsi), MSG_LO

.Lld_partial_16:
	ret
ENDPROC(__load_partial)

/*
 * __store_partial: internal ABI
 * input:
 *   %rdx - dst
 *   %rcx - bytes
 *   T0   - message block to store
 * changed:
 *   %r8
 *   %r9
 *   %r10
 *   T0_LO
 *
 * Writes the first %rcx bytes of T0 to dst in pieces of 16, 8, 4, 2 and 1
 * bytes; %r8 tracks the bytes left and %r9 the write position.
 */
__store_partial:
	mov %rcx, %r8
	mov %rdx, %r9

	cmp $16, %r8
	jb .Lst_partial_16

	movdqu T0_LO, (%r9)
	movdqa T0_HI, T0_LO

	sub $16, %r8
	add $16, %r9

.Lst_partial_16:
	movq T0_LO, %r10

	cmp $8, %r8
	jb .Lst_partial_8

	mov %r10, (%r9)
	psrldq $8, T0_LO
	movq T0_LO, %r10

	sub $8, %r8
	add $8, %r9

.Lst_partial_8:
	cmp $4, %r8
	jb .Lst_partial_4

	mov %r10d, (%r9)
	shr $32, %r10

	sub $4, %r8
	add $4, %r9

.Lst_partial_4:
	cmp $2, %r8
	jb .Lst_partial_2

	mov %r10w, (%r9)
	shr $16, %r10

	sub $2, %r8
	add $2, %r9

.Lst_partial_2:
	cmp $1, %r8
	jb .Lst_partial_1

	mov %r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)

/*
 * void crypto_morus1280_sse2_init(void *state, const void *key,
 *                                 const void *iv);
 *
 * %rdi - state (written, 160 bytes)
 * %rsi - key (32 bytes read)
 * %rdx - iv (16 bytes read)
 */
ENTRY(crypto_morus1280_sse2_init)
	FRAME_BEGIN

	/* load IV: */
	pxor STATE0_HI, STATE0_HI
	movdqu (%rdx), STATE0_LO
	/* load key: */
	movdqu  0(%rsi), KEY_LO
	movdqu 16(%rsi), KEY_HI
	movdqa KEY_LO, STATE1_LO
	movdqa KEY_HI, STATE1_HI
	/* load all ones: */
	pcmpeqd STATE2_LO, STATE2_LO
	pcmpeqd STATE2_HI, STATE2_HI
	/* load all zeros: */
	pxor STATE3_LO, STATE3_LO
	pxor STATE3_HI, STATE3_HI
	/* load the constant: */
	movdqa .Lmorus1280_const_0(%rip), STATE4_LO
	movdqa .Lmorus1280_const_1(%rip), STATE4_HI

	/* update 16 times with zero (KEY stays live in %xmm10/%xmm11): */
	.rept 16
	call __morus1280_update_zero
	.endr

	/* xor-in the key again after updates: */
	pxor KEY_LO, STATE1_LO
	pxor KEY_HI, STATE1_HI

	/* store the state: */
	morus1280_store_state

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_init)

/*
 * void crypto_morus1280_sse2_ad(void *state, const void *data,
 *                               unsigned int length);
 *
 * %rdi - state (read and written)
 * %rsi - data
 * %edx - length
 *
 * Absorbs every complete 32-byte block of data; a trailing partial block
 * is ignored.  Does nothing at all when length < 32.
 */
ENTRY(crypto_morus1280_sse2_ad)
	FRAME_BEGIN

	/* length is a 32-bit argument: clear the upper half before using %rdx */
	mov %edx, %edx

	cmp $32, %rdx
	jb .Lad_out

	/* load the state: */
	morus1280_load_state

	/* pick the aligned loop only if data is 16-byte aligned */
	mov %rsi, %r8
	and $0xF, %r8
	jnz .Lad_u_loop

.align 4
.Lad_a_loop:
	movdqa  0(%rsi), MSG_LO
	movdqa 16(%rsi), MSG_HI
	call __morus1280_update
	sub $32, %rdx
	add $32, %rsi
	cmp $32, %rdx
	jae .Lad_a_loop

	jmp .Lad_cont
.align 4
.Lad_u_loop:
	movdqu  0(%rsi), MSG_LO
	movdqu 16(%rsi), MSG_HI
	call __morus1280_update
	sub $32, %rdx
	add $32, %rsi
	cmp $32, %rdx
	jae .Lad_u_loop

.Lad_cont:
	/* store the state: */
	morus1280_store_state

.Lad_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_ad)

/*
 * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst,
 *                                unsigned int length);
 *
 * %rdi - state (read and written)
 * %rsi - src
 * %rdx - dst
 * %ecx - length
 *
 * Encrypts every complete 32-byte block; a trailing partial block is left
 * untouched.  Does nothing at all when length < 32.
 */
ENTRY(crypto_morus1280_sse2_enc)
	FRAME_BEGIN

	/* length is a 32-bit argument: clear the upper half before using %rcx */
	mov %ecx, %ecx

	cmp $32, %rcx
	jb .Lenc_out

	/* load the state: */
	morus1280_load_state

	/* pick the aligned loop only if src and dst are both 16-byte aligned */
	mov %rsi, %r8
	or  %rdx, %r8
	and $0xF, %r8
	jnz .Lenc_u_loop

.align 4
.Lenc_a_loop:
	movdqa  0(%rsi), MSG_LO
	movdqa 16(%rsi), MSG_HI
	morus1280_encrypt_block
	movdqa T0_LO,  0(%rdx)
	movdqa T0_HI, 16(%rdx)

	/* the state absorbs the plaintext block still held in MSG */
	call __morus1280_update
	sub $32, %rcx
	add $32, %rsi
	add $32, %rdx
	cmp $32, %rcx
	jae .Lenc_a_loop

	jmp .Lenc_cont
.align 4
.Lenc_u_loop:
	movdqu  0(%rsi), MSG_LO
	movdqu 16(%rsi), MSG_HI
	morus1280_encrypt_block
	movdqu T0_LO,  0(%rdx)
	movdqu T0_HI, 16(%rdx)

	call __morus1280_update
	sub $32, %rcx
	add $32, %rsi
	add $32, %rdx
	cmp $32, %rcx
	jae .Lenc_u_loop

.Lenc_cont:
	/* store the state: */
	morus1280_store_state

.Lenc_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_enc)

/*
 * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst,
 *                                     unsigned int length);
 *
 * %rdi - state (read and written)
 * %rsi - src
 * %rdx - dst
 * %ecx - length of the final partial block
 *        (the partial load/store helpers handle counts below 32)
 */
ENTRY(crypto_morus1280_sse2_enc_tail)
	FRAME_BEGIN

	/* length is a 32-bit argument: clear the upper half before using %rcx */
	mov %ecx, %ecx

	/* load the state: */
	morus1280_load_state

	/* encrypt message: */
	call __load_partial

	morus1280_encrypt_block

	call __store_partial

	/* MSG still holds the zero-padded plaintext */
	call __morus1280_update

	/* store the state: */
	morus1280_store_state

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_enc_tail)

/*
 * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst,
 *                                unsigned int length);
 *
 * %rdi - state (read and written)
 * %rsi - src
 * %rdx - dst
 * %ecx - length
 *
 * Decrypts every complete 32-byte block; a trailing partial block is left
 * untouched.  Does nothing at all when length < 32.
 */
ENTRY(crypto_morus1280_sse2_dec)
	FRAME_BEGIN

	/* length is a 32-bit argument: clear the upper half before using %rcx */
	mov %ecx, %ecx

	cmp $32, %rcx
	jb .Ldec_out

	/* load the state: */
	morus1280_load_state

	/* pick the aligned loop only if src and dst are both 16-byte aligned */
	mov %rsi, %r8
	or  %rdx, %r8
	and $0xF, %r8
	jnz .Ldec_u_loop

.align 4
.Ldec_a_loop:
	movdqa  0(%rsi), MSG_LO
	movdqa 16(%rsi), MSG_HI
	morus1280_xor_keystream
	movdqa MSG_LO,  0(%rdx)
	movdqa MSG_HI, 16(%rdx)

	/* the state absorbs the recovered plaintext block held in MSG */
	call __morus1280_update
	sub $32, %rcx
	add $32, %rsi
	add $32, %rdx
	cmp $32, %rcx
	jae .Ldec_a_loop

	jmp .Ldec_cont
.align 4
.Ldec_u_loop:
	movdqu  0(%rsi), MSG_LO
	movdqu 16(%rsi), MSG_HI
	morus1280_xor_keystream
	movdqu MSG_LO,  0(%rdx)
	movdqu MSG_HI, 16(%rdx)

	call __morus1280_update
	sub $32, %rcx
	add $32, %rsi
	add $32, %rdx
	cmp $32, %rcx
	jae .Ldec_u_loop

.Ldec_cont:
	/* store the state: */
	morus1280_store_state

.Ldec_out:
	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_dec)

/*
 * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst,
 *                                     unsigned int length);
 *
 * %rdi - state (read and written)
 * %rsi - src
 * %rdx - dst
 * %ecx - length of the final partial block
 *        (the partial load/store helpers handle counts below 32)
 */
ENTRY(crypto_morus1280_sse2_dec_tail)
	FRAME_BEGIN

	/* length is a 32-bit argument: clear the upper half before using %rcx */
	mov %ecx, %ecx

	/* load the state: */
	morus1280_load_state

	/* decrypt message: */
	call __load_partial

	morus1280_xor_keystream
	movdqa MSG_LO, T0_LO
	movdqa MSG_HI, T0_HI

	call __store_partial

	/*
	 * mask with byte count: broadcast the low byte of the count to every
	 * byte of T0, then keep only the MSG bytes whose index is below it,
	 * so the keystream bytes past the end do not enter the state.
	 */
	movq %rcx, T0_LO
	punpcklbw T0_LO, T0_LO
	punpcklbw T0_LO, T0_LO
	punpcklbw T0_LO, T0_LO
	punpcklbw T0_LO, T0_LO
	movdqa T0_LO, T0_HI
	movdqa .Lmorus1280_counter_0(%rip), T1_LO
	movdqa .Lmorus1280_counter_1(%rip), T1_HI
	pcmpgtb T1_LO, T0_LO
	pcmpgtb T1_HI, T0_HI
	pand T0_LO, MSG_LO
	pand T0_HI, MSG_HI

	call __morus1280_update

	/* store the state: */
	morus1280_store_state

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_dec_tail)

/*
 * void crypto_morus1280_sse2_final(void *state, void *tag_xor,
 *                                  u64 assoclen, u64 cryptlen);
 *
 * %rdi - state (read only; the updated state is not written back)
 * %rsi - tag_xor (32 bytes, read, XORed with the tag, written back)
 * %rdx - assoclen in bytes
 * %rcx - cryptlen in bytes
 */
ENTRY(crypto_morus1280_sse2_final)
	FRAME_BEGIN

	/* load the state: */
	morus1280_load_state

	/* xor state[0] into state[4]: */
	pxor STATE0_LO, STATE4_LO
	pxor STATE0_HI, STATE4_HI

	/* prepare length block: */
	movq %rdx, MSG_LO
	movq %rcx, T0_LO
	pslldq $8, T0_LO
	pxor T0_LO, MSG_LO
	psllq $3, MSG_LO /* multiply by 8 (to get bit count) */
	pxor MSG_HI, MSG_HI

	/* update state 10 times with the length block: */
	.rept 10
	call __morus1280_update
	.endr

	/* xor tag: */
	movdqu  0(%rsi), MSG_LO
	movdqu 16(%rsi), MSG_HI

	morus1280_xor_keystream

	movdqu MSG_LO,  0(%rsi)
	movdqu MSG_HI, 16(%rsi)

	FRAME_END
	ret
ENDPROC(crypto_morus1280_sse2_final)