/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.section	.rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003

.section	.rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302

.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.text

ENTRY(chacha20_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: 8 data blocks output, o
	# %rdx: 8 data blocks input, i

	# This function encrypts eight consecutive ChaCha20 blocks by loading
	# the state matrix into AVX registers eight times, one lane per block.
	# As we need some scratch registers, we save the first four registers
	# on the stack. The algorithm performs each operation on the
	# corresponding word of each state matrix, so it requires no word
	# shuffling. For the final XORing step we transpose the matrix by
	# interleaving 32-, 64- and then 128-bit words, which allows us to do
	# the XOR in AVX registers. The 8/16-bit word rotations are done with
	# the slightly better performing byte shuffling; the 7/12-bit word
	# rotations use the traditional shift+OR.

	vzeroupper
	# 4 * 32 byte stack, 32-byte aligned
	lea 8(%rsp),%r10
	and $~31, %rsp
	sub $0x80, %rsp

	# x0..15[0-7] = s[0..15]
	vpbroadcastd 0x00(%rdi),%ymm0
	vpbroadcastd 0x04(%rdi),%ymm1
	vpbroadcastd 0x08(%rdi),%ymm2
	vpbroadcastd 0x0c(%rdi),%ymm3
	vpbroadcastd 0x10(%rdi),%ymm4
	vpbroadcastd 0x14(%rdi),%ymm5
	vpbroadcastd 0x18(%rdi),%ymm6
	vpbroadcastd 0x1c(%rdi),%ymm7
	vpbroadcastd 0x20(%rdi),%ymm8
	vpbroadcastd 0x24(%rdi),%ymm9
	vpbroadcastd 0x28(%rdi),%ymm10
	vpbroadcastd 0x2c(%rdi),%ymm11
	vpbroadcastd 0x30(%rdi),%ymm12
	vpbroadcastd 0x34(%rdi),%ymm13
	vpbroadcastd 0x38(%rdi),%ymm14
	vpbroadcastd 0x3c(%rdi),%ymm15
	# x0..3 on stack
	vmovdqa %ymm0,0x00(%rsp)
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa %ymm3,0x60(%rsp)

	vmovdqa CTRINC(%rip),%ymm1
	vmovdqa ROT8(%rip),%ymm2
	vmovdqa ROT16(%rip),%ymm3

	# x12 += counter values 0-7
	vpaddd %ymm1,%ymm12,%ymm12

	mov $10,%ecx

.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd 0x00(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm3,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd 0x20(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm3,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd 0x40(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm3,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd 0x60(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm3,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd %ymm12,%ymm8,%ymm8
	vpxor %ymm8,%ymm4,%ymm4
	vpslld $12,%ymm4,%ymm0
	vpsrld $20,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4
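
	# Note: x0..x3 were spilled to the stack above so that %ymm0 can serve
	# as the scratch register here and %ymm1..%ymm3 can hold CTRINC, ROT8
	# and ROT16 for the whole loop; x0..x3 are therefore updated through
	# their stack slots.
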
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd %ymm13,%ymm9,%ymm9
	vpxor %ymm9,%ymm5,%ymm5
	vpslld $12,%ymm5,%ymm0
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd %ymm14,%ymm10,%ymm10
	vpxor %ymm10,%ymm6,%ymm6
	vpslld $12,%ymm6,%ymm0
	vpsrld $20,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd %ymm15,%ymm11,%ymm11
	vpxor %ymm11,%ymm7,%ymm7
	vpslld $12,%ymm7,%ymm0
	vpsrld $20,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd 0x00(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm2,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd 0x20(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm2,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd 0x40(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm2,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd 0x60(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm2,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd %ymm12,%ymm8,%ymm8
	vpxor %ymm8,%ymm4,%ymm4
	vpslld $7,%ymm4,%ymm0
	vpsrld $25,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd %ymm13,%ymm9,%ymm9
	vpxor %ymm9,%ymm5,%ymm5
	vpslld $7,%ymm5,%ymm0
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd %ymm14,%ymm10,%ymm10
	vpxor %ymm10,%ymm6,%ymm6
	vpslld $7,%ymm6,%ymm0
	vpsrld $25,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd %ymm15,%ymm11,%ymm11
	vpxor %ymm11,%ymm7,%ymm7
	vpslld $7,%ymm7,%ymm0
	vpsrld $25,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd 0x00(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm3,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd 0x20(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm3,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd 0x40(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm3,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd 0x60(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm3,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd %ymm15,%ymm10,%ymm10
	vpxor %ymm10,%ymm5,%ymm5
	vpslld $12,%ymm5,%ymm0
	vpsrld $20,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd %ymm12,%ymm11,%ymm11
	vpxor %ymm11,%ymm6,%ymm6
	vpslld $12,%ymm6,%ymm0
	vpsrld $20,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd %ymm13,%ymm8,%ymm8
	vpxor %ymm8,%ymm7,%ymm7
	vpslld $12,%ymm7,%ymm0
	vpsrld $20,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd %ymm14,%ymm9,%ymm9
	vpxor %ymm9,%ymm4,%ymm4
	vpslld $12,%ymm4,%ymm0
	vpsrld $20,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd 0x00(%rsp),%ymm5,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpxor %ymm0,%ymm15,%ymm15
	vpshufb %ymm2,%ymm15,%ymm15
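
	# Rotations by 8 and 16 go through vpshufb using the ROT8 and ROT16
	# masks kept in %ymm2 and %ymm3 for the duration of the loop; the
	# 12- and 7-bit rotations are open-coded as shift-left + shift-right
	# + OR with %ymm0 as scratch.
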
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd 0x20(%rsp),%ymm6,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpxor %ymm0,%ymm12,%ymm12
	vpshufb %ymm2,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd 0x40(%rsp),%ymm7,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpxor %ymm0,%ymm13,%ymm13
	vpshufb %ymm2,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd 0x60(%rsp),%ymm4,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpxor %ymm0,%ymm14,%ymm14
	vpshufb %ymm2,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd %ymm15,%ymm10,%ymm10
	vpxor %ymm10,%ymm5,%ymm5
	vpslld $7,%ymm5,%ymm0
	vpsrld $25,%ymm5,%ymm5
	vpor %ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd %ymm12,%ymm11,%ymm11
	vpxor %ymm11,%ymm6,%ymm6
	vpslld $7,%ymm6,%ymm0
	vpsrld $25,%ymm6,%ymm6
	vpor %ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd %ymm13,%ymm8,%ymm8
	vpxor %ymm8,%ymm7,%ymm7
	vpslld $7,%ymm7,%ymm0
	vpsrld $25,%ymm7,%ymm7
	vpor %ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd %ymm14,%ymm9,%ymm9
	vpxor %ymm9,%ymm4,%ymm4
	vpslld $7,%ymm4,%ymm0
	vpsrld $25,%ymm4,%ymm4
	vpor %ymm0,%ymm4,%ymm4

	dec %ecx
	jnz .Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpbroadcastd 0x00(%rdi),%ymm0
	vpaddd 0x00(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x00(%rsp)
	vpbroadcastd 0x04(%rdi),%ymm0
	vpaddd 0x20(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x20(%rsp)
	vpbroadcastd 0x08(%rdi),%ymm0
	vpaddd 0x40(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x40(%rsp)
	vpbroadcastd 0x0c(%rdi),%ymm0
	vpaddd 0x60(%rsp),%ymm0,%ymm0
	vmovdqa %ymm0,0x60(%rsp)
	vpbroadcastd 0x10(%rdi),%ymm0
	vpaddd %ymm0,%ymm4,%ymm4
	vpbroadcastd 0x14(%rdi),%ymm0
	vpaddd %ymm0,%ymm5,%ymm5
	vpbroadcastd 0x18(%rdi),%ymm0
	vpaddd %ymm0,%ymm6,%ymm6
	vpbroadcastd 0x1c(%rdi),%ymm0
	vpaddd %ymm0,%ymm7,%ymm7
	vpbroadcastd 0x20(%rdi),%ymm0
	vpaddd %ymm0,%ymm8,%ymm8
	vpbroadcastd 0x24(%rdi),%ymm0
	vpaddd %ymm0,%ymm9,%ymm9
	vpbroadcastd 0x28(%rdi),%ymm0
	vpaddd %ymm0,%ymm10,%ymm10
	vpbroadcastd 0x2c(%rdi),%ymm0
	vpaddd %ymm0,%ymm11,%ymm11
	vpbroadcastd 0x30(%rdi),%ymm0
	vpaddd %ymm0,%ymm12,%ymm12
	vpbroadcastd 0x34(%rdi),%ymm0
	vpaddd %ymm0,%ymm13,%ymm13
	vpbroadcastd 0x38(%rdi),%ymm0
	vpaddd %ymm0,%ymm14,%ymm14
	vpbroadcastd 0x3c(%rdi),%ymm0
	vpaddd %ymm0,%ymm15,%ymm15

	# x12 += counter values 0-7
	vpaddd %ymm1,%ymm12,%ymm12

	# interleave 32-bit words in state n, n+1
	vmovdqa 0x00(%rsp),%ymm0
	vmovdqa 0x20(%rsp),%ymm1
	vpunpckldq %ymm1,%ymm0,%ymm2
	vpunpckhdq %ymm1,%ymm0,%ymm1
	vmovdqa %ymm2,0x00(%rsp)
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa 0x40(%rsp),%ymm0
	vmovdqa 0x60(%rsp),%ymm1
	vpunpckldq %ymm1,%ymm0,%ymm2
	vpunpckhdq %ymm1,%ymm0,%ymm1
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa %ymm1,0x60(%rsp)
	vmovdqa %ymm4,%ymm0
	vpunpckldq %ymm5,%ymm0,%ymm4
	vpunpckhdq %ymm5,%ymm0,%ymm5
	vmovdqa %ymm6,%ymm0
	vpunpckldq %ymm7,%ymm0,%ymm6
	vpunpckhdq %ymm7,%ymm0,%ymm7
	vmovdqa %ymm8,%ymm0
	vpunpckldq %ymm9,%ymm0,%ymm8
	vpunpckhdq %ymm9,%ymm0,%ymm9
	vmovdqa %ymm10,%ymm0
	vpunpckldq %ymm11,%ymm0,%ymm10
	vpunpckhdq %ymm11,%ymm0,%ymm11
	vmovdqa %ymm12,%ymm0
	vpunpckldq %ymm13,%ymm0,%ymm12
	vpunpckhdq %ymm13,%ymm0,%ymm13
	vmovdqa %ymm14,%ymm0
	vpunpckldq %ymm15,%ymm0,%ymm14
	vpunpckhdq %ymm15,%ymm0,%ymm15
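
	# The 32-bit interleave above is the first of three transpose passes;
	# the 64-bit and 128-bit passes below complete it, so that each
	# 32-byte register (or stack slot) ends up holding one contiguous
	# half of a 64-byte output block. This is why the XOR/store offsets
	# at the end are interleaved rather than sequential.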

	# interleave 64-bit words in state n, n+2
	vmovdqa 0x00(%rsp),%ymm0
	vmovdqa 0x40(%rsp),%ymm2
	vpunpcklqdq %ymm2,%ymm0,%ymm1
	vpunpckhqdq %ymm2,%ymm0,%ymm2
	vmovdqa %ymm1,0x00(%rsp)
	vmovdqa %ymm2,0x40(%rsp)
	vmovdqa 0x20(%rsp),%ymm0
	vmovdqa 0x60(%rsp),%ymm2
	vpunpcklqdq %ymm2,%ymm0,%ymm1
	vpunpckhqdq %ymm2,%ymm0,%ymm2
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa %ymm2,0x60(%rsp)
	vmovdqa %ymm4,%ymm0
	vpunpcklqdq %ymm6,%ymm0,%ymm4
	vpunpckhqdq %ymm6,%ymm0,%ymm6
	vmovdqa %ymm5,%ymm0
	vpunpcklqdq %ymm7,%ymm0,%ymm5
	vpunpckhqdq %ymm7,%ymm0,%ymm7
	vmovdqa %ymm8,%ymm0
	vpunpcklqdq %ymm10,%ymm0,%ymm8
	vpunpckhqdq %ymm10,%ymm0,%ymm10
	vmovdqa %ymm9,%ymm0
	vpunpcklqdq %ymm11,%ymm0,%ymm9
	vpunpckhqdq %ymm11,%ymm0,%ymm11
	vmovdqa %ymm12,%ymm0
	vpunpcklqdq %ymm14,%ymm0,%ymm12
	vpunpckhqdq %ymm14,%ymm0,%ymm14
	vmovdqa %ymm13,%ymm0
	vpunpcklqdq %ymm15,%ymm0,%ymm13
	vpunpckhqdq %ymm15,%ymm0,%ymm15

	# interleave 128-bit words in state n, n+4
	vmovdqa 0x00(%rsp),%ymm0
	vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
	vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
	vmovdqa %ymm1,0x00(%rsp)
	vmovdqa 0x20(%rsp),%ymm0
	vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
	vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
	vmovdqa %ymm1,0x20(%rsp)
	vmovdqa 0x40(%rsp),%ymm0
	vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
	vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
	vmovdqa %ymm1,0x40(%rsp)
	vmovdqa 0x60(%rsp),%ymm0
	vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
	vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
	vmovdqa %ymm1,0x60(%rsp)
	vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
	vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
	vmovdqa %ymm0,%ymm8
	vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
	vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
	vmovdqa %ymm0,%ymm9
	vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
	vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
	vmovdqa %ymm0,%ymm10
	vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
	vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
	vmovdqa %ymm0,%ymm11

	# xor with corresponding input, write to output
	vmovdqa 0x00(%rsp),%ymm0
	vpxor 0x0000(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0000(%rsi)
	vmovdqa 0x20(%rsp),%ymm0
	vpxor 0x0080(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0080(%rsi)
	vmovdqa 0x40(%rsp),%ymm0
	vpxor 0x0040(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x0040(%rsi)
	vmovdqa 0x60(%rsp),%ymm0
	vpxor 0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu %ymm0,0x00c0(%rsi)
	vpxor 0x0100(%rdx),%ymm4,%ymm4
	vmovdqu %ymm4,0x0100(%rsi)
	vpxor 0x0180(%rdx),%ymm5,%ymm5
	vmovdqu %ymm5,0x0180(%rsi)
	vpxor 0x0140(%rdx),%ymm6,%ymm6
	vmovdqu %ymm6,0x0140(%rsi)
	vpxor 0x01c0(%rdx),%ymm7,%ymm7
	vmovdqu %ymm7,0x01c0(%rsi)
	vpxor 0x0020(%rdx),%ymm8,%ymm8
	vmovdqu %ymm8,0x0020(%rsi)
	vpxor 0x00a0(%rdx),%ymm9,%ymm9
	vmovdqu %ymm9,0x00a0(%rsi)
	vpxor 0x0060(%rdx),%ymm10,%ymm10
	vmovdqu %ymm10,0x0060(%rsi)
	vpxor 0x00e0(%rdx),%ymm11,%ymm11
	vmovdqu %ymm11,0x00e0(%rsi)
	vpxor 0x0120(%rdx),%ymm12,%ymm12
	vmovdqu %ymm12,0x0120(%rsi)
	vpxor 0x01a0(%rdx),%ymm13,%ymm13
	vmovdqu %ymm13,0x01a0(%rsi)
	vpxor 0x0160(%rdx),%ymm14,%ymm14
	vmovdqu %ymm14,0x0160(%rsi)
	vpxor 0x01e0(%rdx),%ymm15,%ymm15
	vmovdqu %ymm15,0x01e0(%rsi)

	vzeroupper
	lea -8(%r10),%rsp
	ret
ENDPROC(chacha20_8block_xor_avx2)