/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/* Byte-shuffle masks implementing 32-bit word rotation by 8 and 16 bits. */
.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
/* Per-lane counter increments 0..3 for the 4-block variant. */
.section .rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000

.text

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
 * function performs matrix operations on four words in parallel, but requires
 * shuffling to rearrange the words after each round. 8/16-bit word rotation is
 * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
 * rotation uses traditional shift+OR.
 *
 * The round count is given in %r8d (must be even: two rounds per loop pass).
 *
 * Clobbers: %r8d, %xmm4-%xmm7
 */
chacha_permute:

	# Load the rotate-by-8 / rotate-by-16 shuffle masks once, up front.
	movdqa		ROT8(%rip),%xmm4
	movdqa		ROT16(%rip),%xmm5

.Ldoubleround:
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# Rotate rows so the next (diagonal) round can reuse the column code.
	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm3,%xmm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# Undo the row rotation.
	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm3,%xmm3

	sub		$2,%r8d
	jnz		.Ldoubleround

	ret
ENDPROC(chacha_permute)

/*
 * chacha_block_xor_ssse3 - generate and XOR up to one keystream block
 *
 * Generates one ChaCha block from the state at %rdi and XORs it into the
 * input, handling a partial trailing block via a stack bounce buffer.
 */
ENTRY(chacha_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 1 data block output, o
	# %rdx: up to 1 data block input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
	FRAME_BEGIN

	# x0..3 = s0..3
	movdqa		0x00(%rdi),%xmm0
	movdqa		0x10(%rdi),%xmm1
	movdqa		0x20(%rdi),%xmm2
	movdqa		0x30(%rdi),%xmm3
	# Keep a copy of the initial state for the final feed-forward add.
	movdqa		%xmm0,%xmm8
	movdqa		%xmm1,%xmm9
	movdqa		%xmm2,%xmm10
	movdqa		%xmm3,%xmm11

	# %rax = remaining length; nrounds is already in %r8d for the callee.
	mov		%rcx,%rax
	call		chacha_permute

	# o0 = i0 ^ (x0 + s0)
	paddd		%xmm8,%xmm0
	cmp		$0x10,%rax
	jl		.Lxorpart
	movdqu		0x00(%rdx),%xmm4
	pxor		%xmm4,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	paddd		%xmm9,%xmm1
	# %xmm0 holds the current keystream chunk in case we jump to .Lxorpart.
	movdqa		%xmm1,%xmm0
	cmp		$0x20,%rax
	jl		.Lxorpart
	movdqu		0x10(%rdx),%xmm0
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	paddd		%xmm10,%xmm2
	movdqa		%xmm2,%xmm0
	cmp		$0x30,%rax
	jl		.Lxorpart
	movdqu		0x20(%rdx),%xmm0
	pxor		%xmm2,%xmm0
	movdqu		%xmm0,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	paddd		%xmm11,%xmm3
	movdqa		%xmm3,%xmm0
	cmp		$0x40,%rax
	jl		.Lxorpart
	movdqu		0x30(%rdx),%xmm0
	pxor		%xmm3,%xmm0
	movdqu		%xmm0,0x30(%rsi)

.Ldone:
	FRAME_END
	ret

.Lxorpart:
	# xor remaining bytes from partial register into output
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone
	# Round the length down to the last full 16-byte boundary.
	and		$~0x0f,%rax

	mov		%rsi,%r11

	# Carve out an aligned 16-byte scratch slot on the stack;
	# %r10 preserves the original stack pointer for restoration.
	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	# Copy the partial input tail into the scratch slot.
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# XOR with the keystream chunk left in %xmm0.
	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	# Copy the XORed tail back out to the output buffer.
	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	lea		-8(%r10),%rsp
	jmp		.Ldone

ENDPROC(chacha_block_xor_ssse3)

/*
 * hchacha_block_ssse3 - HChaCha core: permute the state and emit words
 * 0..3 and 12..15 (no feed-forward addition).
 */
ENTRY(hchacha_block_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: output (8 32-bit words)
	# %edx: nrounds
	FRAME_BEGIN

	movdqa		0x00(%rdi),%xmm0
	movdqa		0x10(%rdi),%xmm1
	movdqa		0x20(%rdi),%xmm2
	movdqa		0x30(%rdi),%xmm3

	mov		%edx,%r8d
	call		chacha_permute

	# Output rows 0 and 3 of the permuted state.
	movdqu		%xmm0,0x00(%rsi)
	movdqu		%xmm3,0x10(%rsi)

	FRAME_END
	ret
ENDPROC(hchacha_block_ssse3)

ENTRY(chacha_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four consecutive ChaCha blocks by loading
	# the state matrix in SSE registers four times. As we need some scratch
	# registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For final XORing step
	# we transpose the matrix by interleaving 32- and then 64-bit words,
	# which allows us to do XOR in SSE registers. 8/16-bit word rotation is
	# done with the slightly better performing SSSE3 byte shuffling,
	# 7/12-bit word rotation uses traditional shift+OR.

	# Reserve a 64-byte-aligned scratch area for rows x0..x3;
	# %r10 preserves the original stack pointer.
	lea		8(%rsp),%r10
	sub		$0x80,%rsp
	and		$~63,%rsp
	mov		%rcx,%rax

	# x0..15[0-3] = s0..3[0..3]
	# Each state word is broadcast across one XMM register (4 lanes).
	movq		0x00(%rdi),%xmm1
	pshufd		$0x00,%xmm1,%xmm0
	pshufd		$0x55,%xmm1,%xmm1
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	movq		0x10(%rdi),%xmm5
	pshufd		$0x00,%xmm5,%xmm4
	pshufd		$0x55,%xmm5,%xmm5
	movq		0x18(%rdi),%xmm7
	pshufd		$0x00,%xmm7,%xmm6
	pshufd		$0x55,%xmm7,%xmm7
	movq		0x20(%rdi),%xmm9
	pshufd		$0x00,%xmm9,%xmm8
	pshufd		$0x55,%xmm9,%xmm9
	movq		0x28(%rdi),%xmm11
	pshufd		$0x00,%xmm11,%xmm10
	pshufd		$0x55,%xmm11,%xmm11
	movq		0x30(%rdi),%xmm13
	pshufd		$0x00,%xmm13,%xmm12
	pshufd		$0x55,%xmm13,%xmm13
	movq		0x38(%rdi),%xmm15
	pshufd		$0x00,%xmm15,%xmm14
	pshufd		$0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa		%xmm0,0x00(%rsp)
	movdqa		%xmm1,0x10(%rsp)
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm3,0x30(%rsp)

	movdqa		CTRINC(%rip),%xmm1
	movdqa		ROT8(%rip),%xmm2
	movdqa		ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7

	# Diagonal round follows (same ops, rotated operand pattern).
	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4

	sub		$2,%r8d
	jnz		.Ldoubleround4

	# Feed-forward: add the original state words back in.
	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq		0x00(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x00(%rsp),%xmm2
	movdqa		%xmm2,0x00(%rsp)
	paddd		0x10(%rsp),%xmm3
	movdqa		%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x20(%rsp),%xmm2
	movdqa		%xmm2,0x20(%rsp)
	paddd		0x30(%rsp),%xmm3
	movdqa		%xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq		0x10(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm4
	paddd		%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq		0x18(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm6
	paddd		%xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq		0x20(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm8
	paddd		%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq		0x28(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm10
	paddd		%xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq		0x30(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm12
	paddd		%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq		0x38(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm14
	paddd		%xmm3,%xmm15

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

	# Transpose step 1 of 2:
	# interleave 32-bit words in state n, n+1
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x10(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x10(%rsp)
	movdqa		0x20(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa		%xmm0,%xmm5
	movdqa		%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa		%xmm0,%xmm9
	movdqa		%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa		%xmm0,%xmm13
	movdqa		%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# Transpose step 2 of 2:
	# interleave 64-bit words in state n, n+2
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x20(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x20(%rsp)
	movdqa		0x10(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x10(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa		%xmm0,%xmm6
	movdqa		%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa		%xmm0,%xmm10
	movdqa		%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa		%xmm0,%xmm14
	movdqa		%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# xor with corresponding input, write to output
	# Each 16-byte chunk checks the remaining length first; a short
	# final chunk falls through to the .Lxorpart4 bounce-buffer path
	# with the keystream chunk staged in %xmm0.
	movdqa		0x00(%rsp),%xmm0
	cmp		$0x10,%rax
	jl		.Lxorpart4
	movdqu		0x00(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x00(%rsi)

	movdqu		%xmm4,%xmm0
	cmp		$0x20,%rax
	jl		.Lxorpart4
	movdqu		0x10(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)

	movdqu		%xmm8,%xmm0
	cmp		$0x30,%rax
	jl		.Lxorpart4
	movdqu		0x20(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x20(%rsi)

	movdqu		%xmm12,%xmm0
	cmp		$0x40,%rax
	jl		.Lxorpart4
	movdqu		0x30(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x30(%rsi)

	movdqa		0x20(%rsp),%xmm0
	cmp		$0x50,%rax
	jl		.Lxorpart4
	movdqu		0x40(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x40(%rsi)

	movdqu		%xmm6,%xmm0
	cmp		$0x60,%rax
	jl		.Lxorpart4
	movdqu		0x50(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x50(%rsi)

	movdqu		%xmm10,%xmm0
	cmp		$0x70,%rax
	jl		.Lxorpart4
	movdqu		0x60(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x60(%rsi)

	movdqu		%xmm14,%xmm0
	cmp		$0x80,%rax
	jl		.Lxorpart4
	movdqu		0x70(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x70(%rsi)

	movdqa		0x10(%rsp),%xmm0
	cmp		$0x90,%rax
	jl		.Lxorpart4
	movdqu		0x80(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x80(%rsi)

	movdqu		%xmm5,%xmm0
	cmp		$0xa0,%rax
	jl		.Lxorpart4
	movdqu		0x90(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x90(%rsi)

	movdqu		%xmm9,%xmm0
	cmp		$0xb0,%rax
	jl		.Lxorpart4
	movdqu		0xa0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xa0(%rsi)

	movdqu		%xmm13,%xmm0
	cmp		$0xc0,%rax
	jl		.Lxorpart4
	movdqu		0xb0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xb0(%rsi)

	movdqa		0x30(%rsp),%xmm0
	cmp		$0xd0,%rax
	jl		.Lxorpart4
	movdqu		0xc0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xc0(%rsi)

	movdqu		%xmm7,%xmm0
	cmp		$0xe0,%rax
	jl		.Lxorpart4
	movdqu		0xd0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xd0(%rsi)

	movdqu		%xmm11,%xmm0
	cmp		$0xf0,%rax
	jl		.Lxorpart4
	movdqu		0xe0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xe0(%rsi)

	movdqu		%xmm15,%xmm0
	cmp		$0x100,%rax
	jl		.Lxorpart4
	movdqu		0xf0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xf0(%rsi)

.Ldone4:
	# Restore the stack pointer saved at function entry.
	lea		-8(%r10),%rsp
	ret

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone4
	and		$~0x0f,%rax

	mov		%rsi,%r11

	# Bounce the partial input tail through the (aligned) stack
	# scratch area, XOR it with the keystream chunk in %xmm0, and
	# copy the result to the output.
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	jmp		.Ldone4

ENDPROC(chacha_4block_xor_ssse3)