/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

        .text
        .align  6

ENTRY(chacha20_block_xor_neon)
        // x0: Input state matrix, s
        // x1: 1 data block output, o
        // x2: 1 data block input, i

        //
        // This function encrypts one ChaCha20 block by loading the state
        // matrix in four NEON registers. It performs matrix operations on
        // four words in parallel, but requires shuffling to rearrange the
        // words after each round.
        //

        // x0..3 = s0..3
        adr     x3, ROT8
        ld1     {v0.4s-v3.4s}, [x0]
        ld1     {v8.4s-v11.4s}, [x0]
        ld1     {v12.4s}, [x3]

        mov     x3, #10

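        // The loop below runs the ChaCha20 double round ten times. For
        // reference, one quarter-round on words (a, b, c, d), as specified
        // in RFC 7539, is (C-style pseudo-code, not assembled):
        //
        //      a += b; d ^= a; d = rotl32(d, 16);
        //      c += d; b ^= c; b = rotl32(b, 12);
        //      a += b; d ^= a; d = rotl32(d, 8);
        //      c += d; b ^= c; b = rotl32(b, 7);
        //
        // With the state rows in v0-v3, the first half of each iteration
        // performs the four column quarter-rounds in parallel, one per
        // 32-bit lane; the ext shuffles then rotate rows 1-3 so the same
        // code performs the diagonal quarter-rounds in the second half.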
.Ldoubleround:
        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        add     v0.4s, v0.4s, v1.4s
        eor     v3.16b, v3.16b, v0.16b
        rev32   v3.8h, v3.8h

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        add     v2.4s, v2.4s, v3.4s
        eor     v4.16b, v1.16b, v2.16b
        shl     v1.4s, v4.4s, #12
        sri     v1.4s, v4.4s, #20

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        add     v0.4s, v0.4s, v1.4s
        eor     v3.16b, v3.16b, v0.16b
        tbl     v3.16b, {v3.16b}, v12.16b

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        add     v2.4s, v2.4s, v3.4s
        eor     v4.16b, v1.16b, v2.16b
        shl     v1.4s, v4.4s, #7
        sri     v1.4s, v4.4s, #25

        // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        ext     v1.16b, v1.16b, v1.16b, #4
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        ext     v2.16b, v2.16b, v2.16b, #8
        // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        ext     v3.16b, v3.16b, v3.16b, #12

        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        add     v0.4s, v0.4s, v1.4s
        eor     v3.16b, v3.16b, v0.16b
        rev32   v3.8h, v3.8h

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        add     v2.4s, v2.4s, v3.4s
        eor     v4.16b, v1.16b, v2.16b
        shl     v1.4s, v4.4s, #12
        sri     v1.4s, v4.4s, #20

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        add     v0.4s, v0.4s, v1.4s
        eor     v3.16b, v3.16b, v0.16b
        tbl     v3.16b, {v3.16b}, v12.16b

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        add     v2.4s, v2.4s, v3.4s
        eor     v4.16b, v1.16b, v2.16b
        shl     v1.4s, v4.4s, #7
        sri     v1.4s, v4.4s, #25

        // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        ext     v1.16b, v1.16b, v1.16b, #12
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        ext     v2.16b, v2.16b, v2.16b, #8
        // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        ext     v3.16b, v3.16b, v3.16b, #4

        subs    x3, x3, #1
        b.ne    .Ldoubleround

        ld1     {v4.16b-v7.16b}, [x2]

        // o0 = i0 ^ (x0 + s0)
        add     v0.4s, v0.4s, v8.4s
        eor     v0.16b, v0.16b, v4.16b

        // o1 = i1 ^ (x1 + s1)
        add     v1.4s, v1.4s, v9.4s
        eor     v1.16b, v1.16b, v5.16b

        // o2 = i2 ^ (x2 + s2)
        add     v2.4s, v2.4s, v10.4s
        eor     v2.16b, v2.16b, v6.16b

        // o3 = i3 ^ (x3 + s3)
        add     v3.4s, v3.4s, v11.4s
        eor     v3.16b, v3.16b, v7.16b

        st1     {v0.16b-v3.16b}, [x1]

        ret
ENDPROC(chacha20_block_xor_neon)

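// Both functions in this file implement the rotations without a dedicated
// rotate instruction:
//   - rotl32(x, 16) uses rev32 on .8h lanes, which swaps the two halfwords
//     of every 32-bit word
//   - rotl32(x, 12) and rotl32(x, 7) xor into a spare register, then use a
//     shl/sri pair to combine the two shifted halves
//   - rotl32(x, 8) uses tbl with the ROT8 byte-permutation table defined at
//     the end of this file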
        .align  6
ENTRY(chacha20_4block_xor_neon)
        // x0: Input state matrix, s
        // x1: 4 data blocks output, o
        // x2: 4 data blocks input, i

        //
        // This function encrypts four consecutive ChaCha20 blocks by loading
        // the state matrix in NEON registers four times. The algorithm
        // performs each operation on the corresponding word of each state
        // matrix, hence requires no word shuffling. For the final XOR step,
        // we transpose the matrix by interleaving 32- and then 64-bit words,
        // which allows us to do XOR in NEON registers.
        //
        adr     x3, CTRINC              // ... and ROT8
        ld1     {v30.4s-v31.4s}, [x3]

        // x0..15[0-3] = s0..3[0..3]
        mov     x4, x0
        ld4r    { v0.4s- v3.4s}, [x4], #16
        ld4r    { v4.4s- v7.4s}, [x4], #16
        ld4r    { v8.4s-v11.4s}, [x4], #16
        ld4r    {v12.4s-v15.4s}, [x4]

        // x12 += counter values 0-3
        add     v12.4s, v12.4s, v30.4s

        mov     x3, #10

.Ldoubleround4:
        // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        add     v0.4s, v0.4s, v4.4s
        add     v1.4s, v1.4s, v5.4s
        add     v2.4s, v2.4s, v6.4s
        add     v3.4s, v3.4s, v7.4s

        eor     v12.16b, v12.16b, v0.16b
        eor     v13.16b, v13.16b, v1.16b
        eor     v14.16b, v14.16b, v2.16b
        eor     v15.16b, v15.16b, v3.16b

        rev32   v12.8h, v12.8h
        rev32   v13.8h, v13.8h
        rev32   v14.8h, v14.8h
        rev32   v15.8h, v15.8h

        // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        add     v8.4s, v8.4s, v12.4s
        add     v9.4s, v9.4s, v13.4s
        add     v10.4s, v10.4s, v14.4s
        add     v11.4s, v11.4s, v15.4s

        eor     v16.16b, v4.16b, v8.16b
        eor     v17.16b, v5.16b, v9.16b
        eor     v18.16b, v6.16b, v10.16b
        eor     v19.16b, v7.16b, v11.16b

        shl     v4.4s, v16.4s, #12
        shl     v5.4s, v17.4s, #12
        shl     v6.4s, v18.4s, #12
        shl     v7.4s, v19.4s, #12

        sri     v4.4s, v16.4s, #20
        sri     v5.4s, v17.4s, #20
        sri     v6.4s, v18.4s, #20
        sri     v7.4s, v19.4s, #20

        // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        add     v0.4s, v0.4s, v4.4s
        add     v1.4s, v1.4s, v5.4s
        add     v2.4s, v2.4s, v6.4s
        add     v3.4s, v3.4s, v7.4s

        eor     v12.16b, v12.16b, v0.16b
        eor     v13.16b, v13.16b, v1.16b
        eor     v14.16b, v14.16b, v2.16b
        eor     v15.16b, v15.16b, v3.16b

        tbl     v12.16b, {v12.16b}, v31.16b
        tbl     v13.16b, {v13.16b}, v31.16b
        tbl     v14.16b, {v14.16b}, v31.16b
        tbl     v15.16b, {v15.16b}, v31.16b

        // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        add     v8.4s, v8.4s, v12.4s
        add     v9.4s, v9.4s, v13.4s
        add     v10.4s, v10.4s, v14.4s
        add     v11.4s, v11.4s, v15.4s

        eor     v16.16b, v4.16b, v8.16b
        eor     v17.16b, v5.16b, v9.16b
        eor     v18.16b, v6.16b, v10.16b
        eor     v19.16b, v7.16b, v11.16b

        shl     v4.4s, v16.4s, #7
        shl     v5.4s, v17.4s, #7
        shl     v6.4s, v18.4s, #7
        shl     v7.4s, v19.4s, #7

        sri     v4.4s, v16.4s, #25
        sri     v5.4s, v17.4s, #25
        sri     v6.4s, v18.4s, #25
        sri     v7.4s, v19.4s, #25

        // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        add     v0.4s, v0.4s, v5.4s
        add     v1.4s, v1.4s, v6.4s
        add     v2.4s, v2.4s, v7.4s
        add     v3.4s, v3.4s, v4.4s

        eor     v15.16b, v15.16b, v0.16b
        eor     v12.16b, v12.16b, v1.16b
        eor     v13.16b, v13.16b, v2.16b
        eor     v14.16b, v14.16b, v3.16b

        rev32   v15.8h, v15.8h
        rev32   v12.8h, v12.8h
        rev32   v13.8h, v13.8h
        rev32   v14.8h, v14.8h

        // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        add     v10.4s, v10.4s, v15.4s
        add     v11.4s, v11.4s, v12.4s
        add     v8.4s, v8.4s, v13.4s
        add     v9.4s, v9.4s, v14.4s

        eor     v16.16b, v5.16b, v10.16b
        eor     v17.16b, v6.16b, v11.16b
        eor     v18.16b, v7.16b, v8.16b
        eor     v19.16b, v4.16b, v9.16b

        shl     v5.4s, v16.4s, #12
        shl     v6.4s, v17.4s, #12
        shl     v7.4s, v18.4s, #12
        shl     v4.4s, v19.4s, #12

        sri     v5.4s, v16.4s, #20
        sri     v6.4s, v17.4s, #20
        sri     v7.4s, v18.4s, #20
        sri     v4.4s, v19.4s, #20

        // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        add     v0.4s, v0.4s, v5.4s
        add     v1.4s, v1.4s, v6.4s
        add     v2.4s, v2.4s, v7.4s
        add     v3.4s, v3.4s, v4.4s

        eor     v15.16b, v15.16b, v0.16b
        eor     v12.16b, v12.16b, v1.16b
        eor     v13.16b, v13.16b, v2.16b
        eor     v14.16b, v14.16b, v3.16b

        tbl     v15.16b, {v15.16b}, v31.16b
        tbl     v12.16b, {v12.16b}, v31.16b
        tbl     v13.16b, {v13.16b}, v31.16b
        tbl     v14.16b, {v14.16b}, v31.16b

        // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        add     v10.4s, v10.4s, v15.4s
        add     v11.4s, v11.4s, v12.4s
        add     v8.4s, v8.4s, v13.4s
        add     v9.4s, v9.4s, v14.4s

        eor     v16.16b, v5.16b, v10.16b
        eor     v17.16b, v6.16b, v11.16b
        eor     v18.16b, v7.16b, v8.16b
        eor     v19.16b, v4.16b, v9.16b

        shl     v5.4s, v16.4s, #7
        shl     v6.4s, v17.4s, #7
        shl     v7.4s, v18.4s, #7
        shl     v4.4s, v19.4s, #7

        sri     v5.4s, v16.4s, #25
        sri     v6.4s, v17.4s, #25
        sri     v7.4s, v18.4s, #25
        sri     v4.4s, v19.4s, #25

        subs    x3, x3, #1
        b.ne    .Ldoubleround4

        ld4r    {v16.4s-v19.4s}, [x0], #16
        ld4r    {v20.4s-v23.4s}, [x0], #16

        // x12 += counter values 0-3
        add     v12.4s, v12.4s, v30.4s

        // x0[0-3] += s0[0]
        // x1[0-3] += s0[1]
        // x2[0-3] += s0[2]
        // x3[0-3] += s0[3]
        add     v0.4s, v0.4s, v16.4s
        add     v1.4s, v1.4s, v17.4s
        add     v2.4s, v2.4s, v18.4s
        add     v3.4s, v3.4s, v19.4s

        ld4r    {v24.4s-v27.4s}, [x0], #16
        ld4r    {v28.4s-v31.4s}, [x0]

        // x4[0-3] += s1[0]
        // x5[0-3] += s1[1]
        // x6[0-3] += s1[2]
        // x7[0-3] += s1[3]
        add     v4.4s, v4.4s, v20.4s
        add     v5.4s, v5.4s, v21.4s
        add     v6.4s, v6.4s, v22.4s
        add     v7.4s, v7.4s, v23.4s

        // x8[0-3] += s2[0]
        // x9[0-3] += s2[1]
        // x10[0-3] += s2[2]
        // x11[0-3] += s2[3]
        add     v8.4s, v8.4s, v24.4s
        add     v9.4s, v9.4s, v25.4s
        add     v10.4s, v10.4s, v26.4s
        add     v11.4s, v11.4s, v27.4s

        // x12[0-3] += s3[0]
        // x13[0-3] += s3[1]
        // x14[0-3] += s3[2]
        // x15[0-3] += s3[3]
        add     v12.4s, v12.4s, v28.4s
        add     v13.4s, v13.4s, v29.4s
        add     v14.4s, v14.4s, v30.4s
        add     v15.4s, v15.4s, v31.4s

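        // At this point v0..v15 hold the keystream in word-sliced form:
        // register vN holds word N of all four blocks, one block per lane.
        // The zip passes below transpose this back to block order, leaving
        // block 0 in v0-v3, block 1 in v4-v7, block 2 in v8-v11 and block 3
        // in v12-v15, so that each 64-byte block of input can be XORed with
        // four consecutive registers.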
        // interleave 32-bit words in state n, n+1
        zip1    v16.4s, v0.4s, v1.4s
        zip2    v17.4s, v0.4s, v1.4s
        zip1    v18.4s, v2.4s, v3.4s
        zip2    v19.4s, v2.4s, v3.4s
        zip1    v20.4s, v4.4s, v5.4s
        zip2    v21.4s, v4.4s, v5.4s
        zip1    v22.4s, v6.4s, v7.4s
        zip2    v23.4s, v6.4s, v7.4s
        zip1    v24.4s, v8.4s, v9.4s
        zip2    v25.4s, v8.4s, v9.4s
        zip1    v26.4s, v10.4s, v11.4s
        zip2    v27.4s, v10.4s, v11.4s
        zip1    v28.4s, v12.4s, v13.4s
        zip2    v29.4s, v12.4s, v13.4s
        zip1    v30.4s, v14.4s, v15.4s
        zip2    v31.4s, v14.4s, v15.4s

        // interleave 64-bit words in state n, n+2
        zip1    v0.2d, v16.2d, v18.2d
        zip2    v4.2d, v16.2d, v18.2d
        zip1    v8.2d, v17.2d, v19.2d
        zip2    v12.2d, v17.2d, v19.2d
        ld1     {v16.16b-v19.16b}, [x2], #64

        zip1    v1.2d, v20.2d, v22.2d
        zip2    v5.2d, v20.2d, v22.2d
        zip1    v9.2d, v21.2d, v23.2d
        zip2    v13.2d, v21.2d, v23.2d
        ld1     {v20.16b-v23.16b}, [x2], #64

        zip1    v2.2d, v24.2d, v26.2d
        zip2    v6.2d, v24.2d, v26.2d
        zip1    v10.2d, v25.2d, v27.2d
        zip2    v14.2d, v25.2d, v27.2d
        ld1     {v24.16b-v27.16b}, [x2], #64

        zip1    v3.2d, v28.2d, v30.2d
        zip2    v7.2d, v28.2d, v30.2d
        zip1    v11.2d, v29.2d, v31.2d
        zip2    v15.2d, v29.2d, v31.2d
        ld1     {v28.16b-v31.16b}, [x2]

        // xor with corresponding input, write to output
        eor     v16.16b, v16.16b, v0.16b
        eor     v17.16b, v17.16b, v1.16b
        eor     v18.16b, v18.16b, v2.16b
        eor     v19.16b, v19.16b, v3.16b
        eor     v20.16b, v20.16b, v4.16b
        eor     v21.16b, v21.16b, v5.16b
        st1     {v16.16b-v19.16b}, [x1], #64
        eor     v22.16b, v22.16b, v6.16b
        eor     v23.16b, v23.16b, v7.16b
        eor     v24.16b, v24.16b, v8.16b
        eor     v25.16b, v25.16b, v9.16b
        st1     {v20.16b-v23.16b}, [x1], #64
        eor     v26.16b, v26.16b, v10.16b
        eor     v27.16b, v27.16b, v11.16b
        eor     v28.16b, v28.16b, v12.16b
        st1     {v24.16b-v27.16b}, [x1], #64
        eor     v29.16b, v29.16b, v13.16b
        eor     v30.16b, v30.16b, v14.16b
        eor     v31.16b, v31.16b, v15.16b
        st1     {v28.16b-v31.16b}, [x1]

        ret
ENDPROC(chacha20_4block_xor_neon)

CTRINC: .word   0, 1, 2, 3
ROT8:   .word   0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
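// CTRINC above supplies the per-lane increments 0-3 that are added to x12,
// so the four blocks processed in parallel use consecutive block counters.
// ROT8 is the tbl permutation that rotates each 32-bit lane left by 8 bits:
// within every lane, the source bytes are selected in the order 3, 0, 1, 2.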