/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
.section .rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC: .octa 0x00000003000000020000000100000000

.text

ENTRY(chacha20_block_xor_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: 1 data block output, o
        # %rdx: 1 data block input, i

        # This function encrypts one ChaCha20 block by loading the state
        # matrix in four SSE registers. It performs matrix operations on four
        # words in parallel, but requires shuffling to rearrange the words
        # after each round. 8/16-bit word rotation is done with the slightly
        # better performing SSSE3 byte shuffling, 7/12-bit word rotation uses
        # traditional shift+OR.
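        #
        # For reference, a rough C sketch of the double round this code
        # vectorizes (illustrative only, not part of the build; the QR()
        # macro, the rotl32() helper and the word indices follow RFC 7539
        # and are not defined anywhere in this file):
        #
        #       #define QR(a, b, c, d) do {                     \
        #               a += b; d ^= a; d = rotl32(d, 16);      \
        #               c += d; b ^= c; b = rotl32(b, 12);      \
        #               a += b; d ^= a; d = rotl32(d, 8);       \
        #               c += d; b ^= c; b = rotl32(b, 7);       \
        #       } while (0)
        #
        #       for (i = 0; i < 10; i++) {
        #               QR(x[0], x[4], x[8],  x[12]);   /* column rounds   */
        #               QR(x[1], x[5], x[9],  x[13]);
        #               QR(x[2], x[6], x[10], x[14]);
        #               QR(x[3], x[7], x[11], x[15]);
        #               QR(x[0], x[5], x[10], x[15]);   /* diagonal rounds */
        #               QR(x[1], x[6], x[11], x[12]);
        #               QR(x[2], x[7], x[8],  x[13]);
        #               QR(x[3], x[4], x[9],  x[14]);
        #       }
        #
        # Below, %xmm0..%xmm3 each hold one row of the 4x4 state, so the four
        # column-round QRs run as one set of vector operations; the pshufd
        # shuffles between the two half rounds realign the rows so the
        # diagonal QRs can reuse the same lane-wise code.
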
        # x0..3 = s0..3
        movdqa 0x00(%rdi),%xmm0
        movdqa 0x10(%rdi),%xmm1
        movdqa 0x20(%rdi),%xmm2
        movdqa 0x30(%rdi),%xmm3
        movdqa %xmm0,%xmm8
        movdqa %xmm1,%xmm9
        movdqa %xmm2,%xmm10
        movdqa %xmm3,%xmm11

        movdqa ROT8(%rip),%xmm4
        movdqa ROT16(%rip),%xmm5

        mov $10,%ecx

.Ldoubleround:

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        paddd %xmm1,%xmm0
        pxor %xmm0,%xmm3
        pshufb %xmm5,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        paddd %xmm3,%xmm2
        pxor %xmm2,%xmm1
        movdqa %xmm1,%xmm6
        pslld $12,%xmm6
        psrld $20,%xmm1
        por %xmm6,%xmm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        paddd %xmm1,%xmm0
        pxor %xmm0,%xmm3
        pshufb %xmm4,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        paddd %xmm3,%xmm2
        pxor %xmm2,%xmm1
        movdqa %xmm1,%xmm7
        pslld $7,%xmm7
        psrld $25,%xmm1
        por %xmm7,%xmm1

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        pshufd $0x39,%xmm1,%xmm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        pshufd $0x4e,%xmm2,%xmm2
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        pshufd $0x93,%xmm3,%xmm3

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        paddd %xmm1,%xmm0
        pxor %xmm0,%xmm3
        pshufb %xmm5,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        paddd %xmm3,%xmm2
        pxor %xmm2,%xmm1
        movdqa %xmm1,%xmm6
        pslld $12,%xmm6
        psrld $20,%xmm1
        por %xmm6,%xmm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        paddd %xmm1,%xmm0
        pxor %xmm0,%xmm3
        pshufb %xmm4,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        paddd %xmm3,%xmm2
        pxor %xmm2,%xmm1
        movdqa %xmm1,%xmm7
        pslld $7,%xmm7
        psrld $25,%xmm1
        por %xmm7,%xmm1

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        pshufd $0x93,%xmm1,%xmm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        pshufd $0x4e,%xmm2,%xmm2
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        pshufd $0x39,%xmm3,%xmm3

        dec %ecx
        jnz .Ldoubleround

        # o0 = i0 ^ (x0 + s0)
        movdqu 0x00(%rdx),%xmm4
        paddd %xmm8,%xmm0
        pxor %xmm4,%xmm0
        movdqu %xmm0,0x00(%rsi)
        # o1 = i1 ^ (x1 + s1)
        movdqu 0x10(%rdx),%xmm5
        paddd %xmm9,%xmm1
        pxor %xmm5,%xmm1
        movdqu %xmm1,0x10(%rsi)
        # o2 = i2 ^ (x2 + s2)
        movdqu 0x20(%rdx),%xmm6
        paddd %xmm10,%xmm2
        pxor %xmm6,%xmm2
        movdqu %xmm2,0x20(%rsi)
        # o3 = i3 ^ (x3 + s3)
        movdqu 0x30(%rdx),%xmm7
        paddd %xmm11,%xmm3
        pxor %xmm7,%xmm3
        movdqu %xmm3,0x30(%rsi)

        ret
ENDPROC(chacha20_block_xor_ssse3)

ENTRY(chacha20_4block_xor_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: 4 data blocks output, o
        # %rdx: 4 data blocks input, i

        # This function encrypts four consecutive ChaCha20 blocks by loading
        # the state matrix into SSE registers four times. As we need some
        # scratch registers, we save the first four registers on the stack.
        # The algorithm performs each operation on the corresponding word of
        # each state matrix, hence requires no word shuffling. For the final
        # XORing step we transpose the matrix by interleaving 32- and then
        # 64-bit words, which allows us to do XOR in SSE registers. 8/16-bit
        # word rotation is done with the slightly better performing SSSE3
        # byte shuffling, 7/12-bit word rotation uses traditional shift+OR.
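        #
        # As a rough C analogue of the register layout used below
        # (illustrative only, not part of the build; x[][], state[] and the
        # loop variables are just names for this sketch):
        #
        #       uint32_t x[16][4];      /* x[n][b]: word n of block b */
        #
        #       for (n = 0; n < 16; n++)        /* broadcast input state  */
        #               for (b = 0; b < 4; b++)
        #                       x[n][b] = state[n];
        #       for (b = 0; b < 4; b++)         /* per-block counter, see */
        #               x[12][b] += b;          /* CTRINC                 */
        #
        # Every quarter-round step then operates on x[n][0..3] at once, so no
        # shuffling is needed inside the rounds; the extra cost is the final
        # transpose back into per-block order before the XOR.
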
        lea 8(%rsp),%r10
        sub $0x80,%rsp
        and $~63,%rsp

        # x0..15[0-3] = s0..3[0..3]
        movq 0x00(%rdi),%xmm1
        pshufd $0x00,%xmm1,%xmm0
        pshufd $0x55,%xmm1,%xmm1
        movq 0x08(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        movq 0x10(%rdi),%xmm5
        pshufd $0x00,%xmm5,%xmm4
        pshufd $0x55,%xmm5,%xmm5
        movq 0x18(%rdi),%xmm7
        pshufd $0x00,%xmm7,%xmm6
        pshufd $0x55,%xmm7,%xmm7
        movq 0x20(%rdi),%xmm9
        pshufd $0x00,%xmm9,%xmm8
        pshufd $0x55,%xmm9,%xmm9
        movq 0x28(%rdi),%xmm11
        pshufd $0x00,%xmm11,%xmm10
        pshufd $0x55,%xmm11,%xmm11
        movq 0x30(%rdi),%xmm13
        pshufd $0x00,%xmm13,%xmm12
        pshufd $0x55,%xmm13,%xmm13
        movq 0x38(%rdi),%xmm15
        pshufd $0x00,%xmm15,%xmm14
        pshufd $0x55,%xmm15,%xmm15
        # x0..3 on stack
        movdqa %xmm0,0x00(%rsp)
        movdqa %xmm1,0x10(%rsp)
        movdqa %xmm2,0x20(%rsp)
        movdqa %xmm3,0x30(%rsp)

        movdqa CTRINC(%rip),%xmm1
        movdqa ROT8(%rip),%xmm2
        movdqa ROT16(%rip),%xmm3

        # x12 += counter values 0-3
        paddd %xmm1,%xmm12

        mov $10,%ecx

.Ldoubleround4:
        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        movdqa 0x00(%rsp),%xmm0
        paddd %xmm4,%xmm0
        movdqa %xmm0,0x00(%rsp)
        pxor %xmm0,%xmm12
        pshufb %xmm3,%xmm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        movdqa 0x10(%rsp),%xmm0
        paddd %xmm5,%xmm0
        movdqa %xmm0,0x10(%rsp)
        pxor %xmm0,%xmm13
        pshufb %xmm3,%xmm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        movdqa 0x20(%rsp),%xmm0
        paddd %xmm6,%xmm0
        movdqa %xmm0,0x20(%rsp)
        pxor %xmm0,%xmm14
        pshufb %xmm3,%xmm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        movdqa 0x30(%rsp),%xmm0
        paddd %xmm7,%xmm0
        movdqa %xmm0,0x30(%rsp)
        pxor %xmm0,%xmm15
        pshufb %xmm3,%xmm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        paddd %xmm12,%xmm8
        pxor %xmm8,%xmm4
        movdqa %xmm4,%xmm0
        pslld $12,%xmm0
        psrld $20,%xmm4
        por %xmm0,%xmm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        paddd %xmm13,%xmm9
        pxor %xmm9,%xmm5
        movdqa %xmm5,%xmm0
        pslld $12,%xmm0
        psrld $20,%xmm5
        por %xmm0,%xmm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        paddd %xmm14,%xmm10
        pxor %xmm10,%xmm6
        movdqa %xmm6,%xmm0
        pslld $12,%xmm0
        psrld $20,%xmm6
        por %xmm0,%xmm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        paddd %xmm15,%xmm11
        pxor %xmm11,%xmm7
        movdqa %xmm7,%xmm0
        pslld $12,%xmm0
        psrld $20,%xmm7
        por %xmm0,%xmm7

        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        movdqa 0x00(%rsp),%xmm0
        paddd %xmm4,%xmm0
        movdqa %xmm0,0x00(%rsp)
        pxor %xmm0,%xmm12
        pshufb %xmm2,%xmm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        movdqa 0x10(%rsp),%xmm0
        paddd %xmm5,%xmm0
        movdqa %xmm0,0x10(%rsp)
        pxor %xmm0,%xmm13
        pshufb %xmm2,%xmm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        movdqa 0x20(%rsp),%xmm0
        paddd %xmm6,%xmm0
        movdqa %xmm0,0x20(%rsp)
        pxor %xmm0,%xmm14
        pshufb %xmm2,%xmm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        movdqa 0x30(%rsp),%xmm0
        paddd %xmm7,%xmm0
        movdqa %xmm0,0x30(%rsp)
        pxor %xmm0,%xmm15
        pshufb %xmm2,%xmm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        paddd %xmm12,%xmm8
        pxor %xmm8,%xmm4
        movdqa %xmm4,%xmm0
        pslld $7,%xmm0
        psrld $25,%xmm4
        por %xmm0,%xmm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        paddd %xmm13,%xmm9
        pxor %xmm9,%xmm5
        movdqa %xmm5,%xmm0
        pslld $7,%xmm0
        psrld $25,%xmm5
        por %xmm0,%xmm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        paddd %xmm14,%xmm10
        pxor %xmm10,%xmm6
        movdqa %xmm6,%xmm0
        pslld $7,%xmm0
        psrld $25,%xmm6
        por %xmm0,%xmm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        paddd %xmm15,%xmm11
        pxor %xmm11,%xmm7
        movdqa %xmm7,%xmm0
        pslld $7,%xmm0
        psrld $25,%xmm7
        por %xmm0,%xmm7

        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        movdqa 0x00(%rsp),%xmm0
        paddd %xmm5,%xmm0
        movdqa %xmm0,0x00(%rsp)
        pxor %xmm0,%xmm15
        pshufb %xmm3,%xmm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        movdqa 0x10(%rsp),%xmm0
        paddd %xmm6,%xmm0
        movdqa %xmm0,0x10(%rsp)
        pxor %xmm0,%xmm12
        pshufb %xmm3,%xmm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        movdqa 0x20(%rsp),%xmm0
        paddd %xmm7,%xmm0
        movdqa %xmm0,0x20(%rsp)
        pxor %xmm0,%xmm13
        pshufb %xmm3,%xmm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        movdqa 0x30(%rsp),%xmm0
        paddd %xmm4,%xmm0
        movdqa %xmm0,0x30(%rsp)
        pxor %xmm0,%xmm14
        pshufb %xmm3,%xmm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        paddd %xmm15,%xmm10
        pxor %xmm10,%xmm5
        movdqa %xmm5,%xmm0
        pslld $12,%xmm0
        psrld $20,%xmm5
        por %xmm0,%xmm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        paddd %xmm12,%xmm11
        pxor %xmm11,%xmm6
        movdqa %xmm6,%xmm0
        pslld $12,%xmm0
        psrld $20,%xmm6
        por %xmm0,%xmm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        paddd %xmm13,%xmm8
        pxor %xmm8,%xmm7
        movdqa %xmm7,%xmm0
        pslld $12,%xmm0
        psrld $20,%xmm7
        por %xmm0,%xmm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        paddd %xmm14,%xmm9
        pxor %xmm9,%xmm4
        movdqa %xmm4,%xmm0
        pslld $12,%xmm0
        psrld $20,%xmm4
        por %xmm0,%xmm4

        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        movdqa 0x00(%rsp),%xmm0
        paddd %xmm5,%xmm0
        movdqa %xmm0,0x00(%rsp)
        pxor %xmm0,%xmm15
        pshufb %xmm2,%xmm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        movdqa 0x10(%rsp),%xmm0
        paddd %xmm6,%xmm0
        movdqa %xmm0,0x10(%rsp)
        pxor %xmm0,%xmm12
        pshufb %xmm2,%xmm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        movdqa 0x20(%rsp),%xmm0
        paddd %xmm7,%xmm0
        movdqa %xmm0,0x20(%rsp)
        pxor %xmm0,%xmm13
        pshufb %xmm2,%xmm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        movdqa 0x30(%rsp),%xmm0
        paddd %xmm4,%xmm0
        movdqa %xmm0,0x30(%rsp)
        pxor %xmm0,%xmm14
        pshufb %xmm2,%xmm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        paddd %xmm15,%xmm10
        pxor %xmm10,%xmm5
        movdqa %xmm5,%xmm0
        pslld $7,%xmm0
        psrld $25,%xmm5
        por %xmm0,%xmm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        paddd %xmm12,%xmm11
        pxor %xmm11,%xmm6
        movdqa %xmm6,%xmm0
        pslld $7,%xmm0
        psrld $25,%xmm6
        por %xmm0,%xmm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        paddd %xmm13,%xmm8
        pxor %xmm8,%xmm7
        movdqa %xmm7,%xmm0
        pslld $7,%xmm0
        psrld $25,%xmm7
        por %xmm0,%xmm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        paddd %xmm14,%xmm9
        pxor %xmm9,%xmm4
        movdqa %xmm4,%xmm0
        pslld $7,%xmm0
        psrld $25,%xmm4
        por %xmm0,%xmm4

        dec %ecx
        jnz .Ldoubleround4

        # x0[0-3] += s0[0]
        # x1[0-3] += s0[1]
        movq 0x00(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        paddd 0x00(%rsp),%xmm2
        movdqa %xmm2,0x00(%rsp)
        paddd 0x10(%rsp),%xmm3
        movdqa %xmm3,0x10(%rsp)
        # x2[0-3] += s0[2]
        # x3[0-3] += s0[3]
        movq 0x08(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        paddd 0x20(%rsp),%xmm2
        movdqa %xmm2,0x20(%rsp)
        paddd 0x30(%rsp),%xmm3
        movdqa %xmm3,0x30(%rsp)

        # x4[0-3] += s1[0]
        # x5[0-3] += s1[1]
        movq 0x10(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        paddd %xmm2,%xmm4
        paddd %xmm3,%xmm5
        # x6[0-3] += s1[2]
        # x7[0-3] += s1[3]
        movq 0x18(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        paddd %xmm2,%xmm6
        paddd %xmm3,%xmm7

        # x8[0-3] += s2[0]
        # x9[0-3] += s2[1]
        movq 0x20(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        paddd %xmm2,%xmm8
        paddd %xmm3,%xmm9
        # x10[0-3] += s2[2]
        # x11[0-3] += s2[3]
        movq 0x28(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        paddd %xmm2,%xmm10
        paddd %xmm3,%xmm11

        # x12[0-3] += s3[0]
        # x13[0-3] += s3[1]
        movq 0x30(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        paddd %xmm2,%xmm12
        paddd %xmm3,%xmm13
        # x14[0-3] += s3[2]
        # x15[0-3] += s3[3]
        movq 0x38(%rdi),%xmm3
        pshufd $0x00,%xmm3,%xmm2
        pshufd $0x55,%xmm3,%xmm3
        paddd %xmm2,%xmm14
        paddd %xmm3,%xmm15

        # x12 += counter values 0-3
        paddd %xmm1,%xmm12

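        # The two interleave passes below amount to a 4x4 dword transpose
        # within each group of four states, converting the "word n of all
        # four blocks" layout back to per-block order so each output block
        # can be XORed against the input with whole 128-bit loads. A rough
        # C analogue of the net effect (illustrative only, not part of the
        # build; x and out are just names for this sketch):
        #
        #       uint32_t out[4][16];    /* out[b]: block b, words 0..15 */
        #
        #       for (b = 0; b < 4; b++)
        #               for (n = 0; n < 16; n++)
        #                       out[b][n] = x[n][b];
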
        # interleave 32-bit words in state n, n+1
        movdqa 0x00(%rsp),%xmm0
        movdqa 0x10(%rsp),%xmm1
        movdqa %xmm0,%xmm2
        punpckldq %xmm1,%xmm2
        punpckhdq %xmm1,%xmm0
        movdqa %xmm2,0x00(%rsp)
        movdqa %xmm0,0x10(%rsp)
        movdqa 0x20(%rsp),%xmm0
        movdqa 0x30(%rsp),%xmm1
        movdqa %xmm0,%xmm2
        punpckldq %xmm1,%xmm2
        punpckhdq %xmm1,%xmm0
        movdqa %xmm2,0x20(%rsp)
        movdqa %xmm0,0x30(%rsp)
        movdqa %xmm4,%xmm0
        punpckldq %xmm5,%xmm4
        punpckhdq %xmm5,%xmm0
        movdqa %xmm0,%xmm5
        movdqa %xmm6,%xmm0
        punpckldq %xmm7,%xmm6
        punpckhdq %xmm7,%xmm0
        movdqa %xmm0,%xmm7
        movdqa %xmm8,%xmm0
        punpckldq %xmm9,%xmm8
        punpckhdq %xmm9,%xmm0
        movdqa %xmm0,%xmm9
        movdqa %xmm10,%xmm0
        punpckldq %xmm11,%xmm10
        punpckhdq %xmm11,%xmm0
        movdqa %xmm0,%xmm11
        movdqa %xmm12,%xmm0
        punpckldq %xmm13,%xmm12
        punpckhdq %xmm13,%xmm0
        movdqa %xmm0,%xmm13
        movdqa %xmm14,%xmm0
        punpckldq %xmm15,%xmm14
        punpckhdq %xmm15,%xmm0
        movdqa %xmm0,%xmm15

        # interleave 64-bit words in state n, n+2
        movdqa 0x00(%rsp),%xmm0
        movdqa 0x20(%rsp),%xmm1
        movdqa %xmm0,%xmm2
        punpcklqdq %xmm1,%xmm2
        punpckhqdq %xmm1,%xmm0
        movdqa %xmm2,0x00(%rsp)
        movdqa %xmm0,0x20(%rsp)
        movdqa 0x10(%rsp),%xmm0
        movdqa 0x30(%rsp),%xmm1
        movdqa %xmm0,%xmm2
        punpcklqdq %xmm1,%xmm2
        punpckhqdq %xmm1,%xmm0
        movdqa %xmm2,0x10(%rsp)
        movdqa %xmm0,0x30(%rsp)
        movdqa %xmm4,%xmm0
        punpcklqdq %xmm6,%xmm4
        punpckhqdq %xmm6,%xmm0
        movdqa %xmm0,%xmm6
        movdqa %xmm5,%xmm0
        punpcklqdq %xmm7,%xmm5
        punpckhqdq %xmm7,%xmm0
        movdqa %xmm0,%xmm7
        movdqa %xmm8,%xmm0
        punpcklqdq %xmm10,%xmm8
        punpckhqdq %xmm10,%xmm0
        movdqa %xmm0,%xmm10
        movdqa %xmm9,%xmm0
        punpcklqdq %xmm11,%xmm9
        punpckhqdq %xmm11,%xmm0
        movdqa %xmm0,%xmm11
        movdqa %xmm12,%xmm0
        punpcklqdq %xmm14,%xmm12
        punpckhqdq %xmm14,%xmm0
        movdqa %xmm0,%xmm14
        movdqa %xmm13,%xmm0
        punpcklqdq %xmm15,%xmm13
        punpckhqdq %xmm15,%xmm0
        movdqa %xmm0,%xmm15

        # xor with corresponding input, write to output
        movdqa 0x00(%rsp),%xmm0
        movdqu 0x00(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0x00(%rsi)
        movdqa 0x10(%rsp),%xmm0
        movdqu 0x80(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0x80(%rsi)
        movdqa 0x20(%rsp),%xmm0
        movdqu 0x40(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0x40(%rsi)
        movdqa 0x30(%rsp),%xmm0
        movdqu 0xc0(%rdx),%xmm1
        pxor %xmm1,%xmm0
        movdqu %xmm0,0xc0(%rsi)
        movdqu 0x10(%rdx),%xmm1
        pxor %xmm1,%xmm4
        movdqu %xmm4,0x10(%rsi)
        movdqu 0x90(%rdx),%xmm1
        pxor %xmm1,%xmm5
        movdqu %xmm5,0x90(%rsi)
        movdqu 0x50(%rdx),%xmm1
        pxor %xmm1,%xmm6
        movdqu %xmm6,0x50(%rsi)
        movdqu 0xd0(%rdx),%xmm1
        pxor %xmm1,%xmm7
        movdqu %xmm7,0xd0(%rsi)
        movdqu 0x20(%rdx),%xmm1
        pxor %xmm1,%xmm8
        movdqu %xmm8,0x20(%rsi)
        movdqu 0xa0(%rdx),%xmm1
        pxor %xmm1,%xmm9
        movdqu %xmm9,0xa0(%rsi)
        movdqu 0x60(%rdx),%xmm1
        pxor %xmm1,%xmm10
        movdqu %xmm10,0x60(%rsi)
        movdqu 0xe0(%rdx),%xmm1
        pxor %xmm1,%xmm11
        movdqu %xmm11,0xe0(%rsi)
        movdqu 0x30(%rdx),%xmm1
        pxor %xmm1,%xmm12
        movdqu %xmm12,0x30(%rsi)
        movdqu 0xb0(%rdx),%xmm1
        pxor %xmm1,%xmm13
        movdqu %xmm13,0xb0(%rsi)
        movdqu 0x70(%rdx),%xmm1
        pxor %xmm1,%xmm14
        movdqu %xmm14,0x70(%rsi)
        movdqu 0xf0(%rdx),%xmm1
        pxor %xmm1,%xmm15
        movdqu %xmm15,0xf0(%rsi)

        lea -8(%r10),%rsp
        ret
ENDPROC(chacha20_4block_xor_ssse3)