/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text
	.align		6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
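/*
 * For reference, a rough C sketch of the double round implemented below
 * (illustration only, not part of the build). QR() is the usual RFC 7539
 * quarter-round and rol32() a 32-bit left rotate:
 *
 *	#define QR(a, b, c, d) (			\
 *		a += b, d = rol32(d ^ a, 16),		\
 *		c += d, b = rol32(b ^ c, 12),		\
 *		a += b, d = rol32(d ^ a,  8),		\
 *		c += d, b = rol32(b ^ c,  7))
 *
 *	// column round on the 4x4 word matrix x[16]
 *	QR(x[0], x[4], x[ 8], x[12]);  QR(x[1], x[5], x[ 9], x[13]);
 *	QR(x[2], x[6], x[10], x[14]);  QR(x[3], x[7], x[11], x[15]);
 *	// diagonal round
 *	QR(x[0], x[5], x[10], x[15]);  QR(x[1], x[6], x[11], x[12]);
 *	QR(x[2], x[7], x[ 8], x[13]);  QR(x[3], x[4], x[ 9], x[14]);
 *
 * Each of v0-v3 holds one row of the matrix, so one sequence of NEON
 * instructions performs a quarter-round on all four columns at once. The ext
 * instructions rotate rows 1-3 so that the diagonals line up as columns for
 * the second half of the double round, then rotate them back. The rotates
 * themselves are done with rev32 (by 16), shl+sri pairs (by 12 and 7) and a
 * tbl using the ROT8 byte pattern (by 8).
 */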
chacha_permute:

	adr_l		x10, ROT8
	ld1		{v12.4s}, [x10]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		w3, w3, #2
	b.ne		.Ldoubleround

	ret
ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	// x0..3 = s0..3
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]

	bl		chacha_permute

	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]

	mov		w3, w2
	bl		chacha_permute

	st1		{v0.4s}, [x1], #16
	st1		{v3.4s}, [x1]

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(hchacha_block_neon)

	a0		.req	w12
	a1		.req	w13
	a2		.req	w14
	a3		.req	w15
	a4		.req	w16
	a5		.req	w17
	a6		.req	w19
	a7		.req	w20
	a8		.req	w21
	a9		.req	w22
	a10		.req	w23
	a11		.req	w24
	a12		.req	w25
	a13		.req	w26
	a14		.req	w27
	a15		.req	w28

	.align		6
ENTRY(chacha_4block_xor_neon)
	frame_push	10

	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds
	// x4: byte count

	adr_l		x10, .Lpermute
	and		x5, x4, #63
	add		x10, x10, x5
	add		x11, x10, #64

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix in NEON registers four times. The algorithm performs
	// each operation on the corresponding word of each state matrix, hence
	// requires no word shuffling. For the final XOR step we transpose the
	// matrix by interleaving 32- and then 64-bit words, which allows us to
	// do the XOR in NEON registers.
	//
	// At the same time, a fifth block is encrypted in parallel using
	// scalar registers.
	//
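	// Note that, despite the name, up to five blocks (320 bytes) may be
	// handled per call: four in the NEON lanes plus a fifth one kept
	// entirely in the scalar a0-a15 aliases and interleaved with the NEON
	// instructions. The scalar block uses the base counter and occupies
	// the first 64 bytes of input/output; the NEON lanes get counter
	// offsets 1-4.
	//
	// A rough C picture of the register layout set up below (illustration
	// only; v[i][b] stands for lane b of NEON register vi):
	//
	//	u32 v[16][4];			// v[i][b] = word i of block b+1
	//	u32 a[16];			// the scalar block (block 0)
	//
	//	for (i = 0; i < 16; i++) {
	//		a[i] = state[i];
	//		for (b = 0; b < 4; b++)
	//			v[i][b] = state[i];
	//	}
	//	for (b = 0; b < 4; b++)
	//		v[12][b] += b + 1;	// per-block counters (CTRINC)
	//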
	adr_l		x9, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x9]

	// x0..15[0-3] = s0..3[0..3]
	add		x8, x0, #16
	ld4r		{ v0.4s- v3.4s}, [x0]
	ld4r		{ v4.4s- v7.4s}, [x8], #16
	ld4r		{ v8.4s-v11.4s}, [x8], #16
	ld4r		{v12.4s-v15.4s}, [x8]

	mov		a0, v0.s[0]
	mov		a1, v1.s[0]
	mov		a2, v2.s[0]
	mov		a3, v3.s[0]
	mov		a4, v4.s[0]
	mov		a5, v5.s[0]
	mov		a6, v6.s[0]
	mov		a7, v7.s[0]
	mov		a8, v8.s[0]
	mov		a9, v9.s[0]
	mov		a10, v10.s[0]
	mov		a11, v11.s[0]
	mov		a12, v12.s[0]
	mov		a13, v13.s[0]
	mov		a14, v14.s[0]
	mov		a15, v15.s[0]

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	eor		a15, a15, a3

	rev32		v12.8h, v12.8h
	ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	ror		a14, a14, #16
	rev32		v15.8h, v15.8h
	ror		a15, a15, #16

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	eor		a7, a7, a11

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12
	shl		v6.4s, v18.4s, #12
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	ror		a4, a4, #20
	sri		v5.4s, v17.4s, #20
	ror		a5, a5, #20
	sri		v6.4s, v18.4s, #20
	ror		a6, a6, #20
	sri		v7.4s, v19.4s, #20
	ror		a7, a7, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	eor		a15, a15, a3

	tbl		v12.16b, {v12.16b}, v31.16b
	ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	ror		a14, a14, #24
	tbl		v15.16b, {v15.16b}, v31.16b
	ror		a15, a15, #24

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	eor		a7, a7, a11

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7
	shl		v6.4s, v18.4s, #7
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	ror		a4, a4, #25
	sri		v5.4s, v17.4s, #25
	ror		a5, a5, #25
	sri		v6.4s, v18.4s, #25
	ror		a6, a6, #25
	sri		v7.4s, v19.4s, #25
	ror		a7, a7, #25

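	// Second half of the double round: the same quarter-round is now
	// applied to the diagonals (x0,x5,x10,x15), (x1,x6,x11,x12),
	// (x2,x7,x8,x13) and (x3,x4,x9,x14). Because each vN holds word N of
	// every block, the diagonals are addressed directly by register
	// number and no ext shuffling is needed, unlike in chacha_permute
	// above. In the scalar block, each rotl32 by n is done as a ror by
	// (32 - n).
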
	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	eor		a14, a14, a3

	rev32		v15.8h, v15.8h
	ror		a15, a15, #16
	rev32		v12.8h, v12.8h
	ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	ror		a14, a14, #16

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	eor		a4, a4, a9

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12
	shl		v7.4s, v18.4s, #12
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	ror		a5, a5, #20
	sri		v6.4s, v17.4s, #20
	ror		a6, a6, #20
	sri		v7.4s, v18.4s, #20
	ror		a7, a7, #20
	sri		v4.4s, v19.4s, #20
	ror		a4, a4, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	eor		a14, a14, a3

	tbl		v15.16b, {v15.16b}, v31.16b
	ror		a15, a15, #24
	tbl		v12.16b, {v12.16b}, v31.16b
	ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	ror		a14, a14, #24

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	eor		a4, a4, a9

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7
	shl		v7.4s, v18.4s, #7
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	ror		a5, a5, #25
	sri		v6.4s, v17.4s, #25
	ror		a6, a6, #25
	sri		v7.4s, v18.4s, #25
	ror		a7, a7, #25
	sri		v4.4s, v19.4s, #25
	ror		a4, a4, #25

	subs		w3, w3, #2
	b.ne		.Ldoubleround4

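	// The rounds are done; what remains is to add back the initial state
	// of each block (including its per-block counter) and XOR the result
	// with the input. A rough C model of the rest of the function,
	// assuming five full 64-byte blocks (illustration only; x[b][i] is
	// word i of block b, block 0 being the scalar one):
	//
	//	for (b = 0; b < 5; b++)
	//		for (i = 0; i < 16; i++) {
	//			u32 k = x[b][i] + state[i] + (i == 12 ? b : 0);
	//			put_unaligned_le32(k ^ get_unaligned_le32(in), out);
	//			in += 4; out += 4;
	//		}
	//
	// The CPU_BE() revs below byte-swap the scalar words so that the same
	// little-endian keystream is produced on big-endian kernels.
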
	ld4r		{v16.4s-v19.4s}, [x0], #16
	ld4r		{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s

	// x0[0-3] += s0[0]
	// x1[0-3] += s0[1]
	// x2[0-3] += s0[2]
	// x3[0-3] += s0[3]
	add		v0.4s, v0.4s, v16.4s
	mov		w6, v16.s[0]
	mov		w7, v17.s[0]
	add		v1.4s, v1.4s, v17.4s
	mov		w8, v18.s[0]
	mov		w9, v19.s[0]
	add		v2.4s, v2.4s, v18.4s
	add		a0, a0, w6
	add		a1, a1, w7
	add		v3.4s, v3.4s, v19.4s
	add		a2, a2, w8
	add		a3, a3, w9
CPU_BE(	  rev		a0, a0		)
CPU_BE(	  rev		a1, a1		)
CPU_BE(	  rev		a2, a2		)
CPU_BE(	  rev		a3, a3		)

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

	// x4[0-3] += s1[0]
	// x5[0-3] += s1[1]
	// x6[0-3] += s1[2]
	// x7[0-3] += s1[3]
	add		v4.4s, v4.4s, v20.4s
	mov		w6, v20.s[0]
	mov		w7, v21.s[0]
	add		v5.4s, v5.4s, v21.4s
	mov		w8, v22.s[0]
	mov		w9, v23.s[0]
	add		v6.4s, v6.4s, v22.4s
	add		a4, a4, w6
	add		a5, a5, w7
	add		v7.4s, v7.4s, v23.4s
	add		a6, a6, w8
	add		a7, a7, w9
CPU_BE(	  rev		a4, a4		)
CPU_BE(	  rev		a5, a5		)
CPU_BE(	  rev		a6, a6		)
CPU_BE(	  rev		a7, a7		)

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
	// x10[0-3] += s2[2]
	// x11[0-3] += s2[3]
	add		v8.4s, v8.4s, v24.4s
	mov		w6, v24.s[0]
	mov		w7, v25.s[0]
	add		v9.4s, v9.4s, v25.4s
	mov		w8, v26.s[0]
	mov		w9, v27.s[0]
	add		v10.4s, v10.4s, v26.4s
	add		a8, a8, w6
	add		a9, a9, w7
	add		v11.4s, v11.4s, v27.4s
	add		a10, a10, w8
	add		a11, a11, w9
CPU_BE(	  rev		a8, a8		)
CPU_BE(	  rev		a9, a9		)
CPU_BE(	  rev		a10, a10	)
CPU_BE(	  rev		a11, a11	)

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
	// x14[0-3] += s3[2]
	// x15[0-3] += s3[3]
	add		v12.4s, v12.4s, v28.4s
	mov		w6, v28.s[0]
	mov		w7, v29.s[0]
	add		v13.4s, v13.4s, v29.4s
	mov		w8, v30.s[0]
	mov		w9, v31.s[0]
	add		v14.4s, v14.4s, v30.4s
	add		a12, a12, w6
	add		a13, a13, w7
	add		v15.4s, v15.4s, v31.4s
	add		a14, a14, w8
	add		a15, a15, w9
CPU_BE(	  rev		a12, a12	)
CPU_BE(	  rev		a13, a13	)
CPU_BE(	  rev		a14, a14	)
CPU_BE(	  rev		a15, a15	)

	// interleave 32-bit words in state n, n+1
	ldp		w6, w7, [x2], #64
	zip1		v16.4s, v0.4s, v1.4s
	ldp		w8, w9, [x2, #-56]
	eor		a0, a0, w6
	zip2		v17.4s, v0.4s, v1.4s
	eor		a1, a1, w7
	zip1		v18.4s, v2.4s, v3.4s
	eor		a2, a2, w8
	zip2		v19.4s, v2.4s, v3.4s
	eor		a3, a3, w9
	ldp		w6, w7, [x2, #-48]
	zip1		v20.4s, v4.4s, v5.4s
	ldp		w8, w9, [x2, #-40]
	eor		a4, a4, w6
	zip2		v21.4s, v4.4s, v5.4s
	eor		a5, a5, w7
	zip1		v22.4s, v6.4s, v7.4s
	eor		a6, a6, w8
	zip2		v23.4s, v6.4s, v7.4s
	eor		a7, a7, w9
	ldp		w6, w7, [x2, #-32]
	zip1		v24.4s, v8.4s, v9.4s
	ldp		w8, w9, [x2, #-24]
	eor		a8, a8, w6
	zip2		v25.4s, v8.4s, v9.4s
	eor		a9, a9, w7
	zip1		v26.4s, v10.4s, v11.4s
	eor		a10, a10, w8
	zip2		v27.4s, v10.4s, v11.4s
	eor		a11, a11, w9
	ldp		w6, w7, [x2, #-16]
	zip1		v28.4s, v12.4s, v13.4s
	ldp		w8, w9, [x2, #-8]
	eor		a12, a12, w6
	zip2		v29.4s, v12.4s, v13.4s
	eor		a13, a13, w7
	zip1		v30.4s, v14.4s, v15.4s
	eor		a14, a14, w8
	zip2		v31.4s, v14.4s, v15.4s
	eor		a15, a15, w9

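	// From here on, x5/x6/x7/x8 hold the byte count minus 128/192/256/320.
	// Their sign bits select the partial-block paths 0f-3f below, and the
	// csel/ccmp chains clamp the input accesses: once a block comes up
	// short, the load stride x3 is forced to zero and x2 is pulled back to
	// the last 64 bytes of the source, so the remaining ld1s only ever
	// (re)read bytes that exist. Roughly, for each block (illustration
	// only):
	//
	//	rem = len - 128;		// then -192, -256, -320
	//	if (rem < 0 && stride != 0) {
	//		stride = 0;		// x3
	//		in += rem;		// x2: last 64 bytes of input
	//	}
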
	mov		x3, #64
	subs		x5, x4, #128
	add		x6, x5, x2
	csel		x3, x3, xzr, ge
	csel		x2, x2, x6, ge

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	stp		a0, a1, [x1], #64
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	stp		a2, a3, [x1, #-56]
	ld1		{v16.16b-v19.16b}, [x2], x3

	subs		x6, x4, #192
	ccmp		x3, xzr, #4, lt
	add		x7, x6, x2
	csel		x3, x3, xzr, eq
	csel		x2, x2, x7, eq

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	stp		a4, a5, [x1, #-48]
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	stp		a6, a7, [x1, #-40]
	ld1		{v20.16b-v23.16b}, [x2], x3

	subs		x7, x4, #256
	ccmp		x3, xzr, #4, lt
	add		x8, x7, x2
	csel		x3, x3, xzr, eq
	csel		x2, x2, x8, eq

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	stp		a8, a9, [x1, #-32]
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	stp		a10, a11, [x1, #-24]
	ld1		{v24.16b-v27.16b}, [x2], x3

	subs		x8, x4, #320
	ccmp		x3, xzr, #4, lt
	add		x9, x8, x2
	csel		x2, x2, x9, eq

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	stp		a12, a13, [x1, #-16]
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	stp		a14, a15, [x1, #-8]
	ld1		{v28.16b-v31.16b}, [x2]

	// xor with corresponding input, write to output
	tbnz		x5, #63, 0f
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b
	st1		{v16.16b-v19.16b}, [x1], #64
	cbz		x5, .Lout

	tbnz		x6, #63, 1f
	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1], #64
	cbz		x6, .Lout

	tbnz		x7, #63, 2f
	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b
	st1		{v24.16b-v27.16b}, [x1], #64
	cbz		x7, .Lout

	tbnz		x8, #63, 3f
	eor		v28.16b, v28.16b, v12.16b
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b
	st1		{v28.16b-v31.16b}, [x1]

.Lout:	frame_pop
	ret

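	// Partial-block tails. Instead of a byte loop, each tail issues one
	// full 64-byte store that is slid back so it ends exactly at the end
	// of the output. x10 points at .Lpermute + (len & 63) and x11 at
	// .Lpermute + 64 + (len & 63): tbl with the first set of indices
	// shifts the keystream up (out-of-range indices yield zero), while
	// tbx with the second set refills the leading lanes from the 64
	// output bytes already written, so those bytes are stored back
	// unchanged. A rough C model (illustration only; "ks" is the 64-byte
	// keystream of the partial block):
	//
	//	off  = len & 63;			// partial block length
	//	base = out + len - 64;			// store ends at out + len
	//	memcpy(tail, base, 64 - off);		// bytes already written
	//	for (i = 0; i < off; i++)
	//		tail[64 - off + i] = in[len - off + i] ^ ks[i];
	//	memcpy(base, tail, 64);
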
	// fewer than 128 bytes of in/output
0:	ld1		{v8.16b}, [x10]
	ld1		{v9.16b}, [x11]
	movi		v10.16b, #16
	sub		x2, x1, #64
	add		x1, x1, x5
	ld1		{v16.16b-v19.16b}, [x2]
	tbl		v4.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v5.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v6.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v7.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 192 bytes of in/output
1:	ld1		{v8.16b}, [x10]
	ld1		{v9.16b}, [x11]
	movi		v10.16b, #16
	add		x1, x1, x6
	tbl		v0.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v1.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v2.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v3.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v0.16b
	eor		v21.16b, v21.16b, v1.16b
	eor		v22.16b, v22.16b, v2.16b
	eor		v23.16b, v23.16b, v3.16b
	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 256 bytes of in/output
2:	ld1		{v4.16b}, [x10]
	ld1		{v5.16b}, [x11]
	movi		v6.16b, #16
	add		x1, x1, x7
	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v24.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v25.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v26.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v27.16b, {v20.16b-v23.16b}, v5.16b

	eor		v24.16b, v24.16b, v0.16b
	eor		v25.16b, v25.16b, v1.16b
	eor		v26.16b, v26.16b, v2.16b
	eor		v27.16b, v27.16b, v3.16b
	st1		{v24.16b-v27.16b}, [x1]
	b		.Lout

	// fewer than 320 bytes of in/output
3:	ld1		{v4.16b}, [x10]
	ld1		{v5.16b}, [x11]
	movi		v6.16b, #16
	add		x1, x1, x8
	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v28.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v29.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v30.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v31.16b, {v24.16b-v27.16b}, v5.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x1]
	b		.Lout
ENDPROC(chacha_4block_xor_neon)

	.section	".rodata", "a", %progbits
	.align		L1_CACHE_SHIFT
.Lpermute:
	.set		.Li, 0
	.rept		192
	.byte		(.Li - 64)
	.set		.Li, .Li + 1
	.endr

CTRINC:	.word		1, 2, 3, 4
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
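
/*
 * CTRINC supplies the per-lane block counter increments (the four NEON lanes
 * are blocks 1-4; the scalar block keeps the base counter).  ROT8 is a tbl
 * byte pattern that rotates each 32-bit word left by 8: within every word,
 * result byte i is taken from byte (i + 3) % 4, i.e. indices {3, 0, 1, 2},
 * which pack (little-endian) to 0x02010003 and so on.  .Lpermute holds the
 * 192 bytes (i - 64) for i = 0..191, so a 16-byte window into it yields a run
 * of out-of-range values next to a run of valid indices; out-of-range indices
 * make tbl produce zero bytes and tbx leave its destination untouched, which
 * is what the partial-block tails above rely on.
 */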