1/* 2 * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions 3 * 4 * Copyright (C) 2015 Martin Willi 5 * 6 * This program is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation; either version 2 of the License, or 9 * (at your option) any later version. 10 */ 11 12#include <linux/linkage.h> 13 14.section .rodata.cst32.ANMASK, "aM", @progbits, 32 15.align 32 16ANMASK: .octa 0x0000000003ffffff0000000003ffffff 17 .octa 0x0000000003ffffff0000000003ffffff 18 19.section .rodata.cst32.ORMASK, "aM", @progbits, 32 20.align 32 21ORMASK: .octa 0x00000000010000000000000001000000 22 .octa 0x00000000010000000000000001000000 23 24.text 25 26#define h0 0x00(%rdi) 27#define h1 0x04(%rdi) 28#define h2 0x08(%rdi) 29#define h3 0x0c(%rdi) 30#define h4 0x10(%rdi) 31#define r0 0x00(%rdx) 32#define r1 0x04(%rdx) 33#define r2 0x08(%rdx) 34#define r3 0x0c(%rdx) 35#define r4 0x10(%rdx) 36#define u0 0x00(%r8) 37#define u1 0x04(%r8) 38#define u2 0x08(%r8) 39#define u3 0x0c(%r8) 40#define u4 0x10(%r8) 41#define w0 0x14(%r8) 42#define w1 0x18(%r8) 43#define w2 0x1c(%r8) 44#define w3 0x20(%r8) 45#define w4 0x24(%r8) 46#define y0 0x28(%r8) 47#define y1 0x2c(%r8) 48#define y2 0x30(%r8) 49#define y3 0x34(%r8) 50#define y4 0x38(%r8) 51#define m %rsi 52#define hc0 %ymm0 53#define hc1 %ymm1 54#define hc2 %ymm2 55#define hc3 %ymm3 56#define hc4 %ymm4 57#define hc0x %xmm0 58#define hc1x %xmm1 59#define hc2x %xmm2 60#define hc3x %xmm3 61#define hc4x %xmm4 62#define t1 %ymm5 63#define t2 %ymm6 64#define t1x %xmm5 65#define t2x %xmm6 66#define ruwy0 %ymm7 67#define ruwy1 %ymm8 68#define ruwy2 %ymm9 69#define ruwy3 %ymm10 70#define ruwy4 %ymm11 71#define ruwy0x %xmm7 72#define ruwy1x %xmm8 73#define ruwy2x %xmm9 74#define ruwy3x %xmm10 75#define ruwy4x %xmm11 76#define svxz1 %ymm12 77#define svxz2 %ymm13 78#define svxz3 %ymm14 79#define svxz4 %ymm15 80#define d0 %r9 81#define d1 %r10 82#define d2 %r11 83#define d3 %r12 84#define d4 %r13 85 86ENTRY(poly1305_4block_avx2) 87 # %rdi: Accumulator h[5] 88 # %rsi: 64 byte input block m 89 # %rdx: Poly1305 key r[5] 90 # %rcx: Quadblock count 91 # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5], 92 93 # This four-block variant uses loop unrolled block processing. It 94 # requires 4 Poly1305 keys: r, r^2, r^3 and r^4: 95 # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r 96 97 vzeroupper 98 push %rbx 99 push %r12 100 push %r13 101 102 # combine r0,u0,w0,y0 103 vmovd y0,ruwy0x 104 vmovd w0,t1x 105 vpunpcklqdq t1,ruwy0,ruwy0 106 vmovd u0,t1x 107 vmovd r0,t2x 108 vpunpcklqdq t2,t1,t1 109 vperm2i128 $0x20,t1,ruwy0,ruwy0 110 111 # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5 112 vmovd y1,ruwy1x 113 vmovd w1,t1x 114 vpunpcklqdq t1,ruwy1,ruwy1 115 vmovd u1,t1x 116 vmovd r1,t2x 117 vpunpcklqdq t2,t1,t1 118 vperm2i128 $0x20,t1,ruwy1,ruwy1 119 vpslld $2,ruwy1,svxz1 120 vpaddd ruwy1,svxz1,svxz1 121 122 # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5 123 vmovd y2,ruwy2x 124 vmovd w2,t1x 125 vpunpcklqdq t1,ruwy2,ruwy2 126 vmovd u2,t1x 127 vmovd r2,t2x 128 vpunpcklqdq t2,t1,t1 129 vperm2i128 $0x20,t1,ruwy2,ruwy2 130 vpslld $2,ruwy2,svxz2 131 vpaddd ruwy2,svxz2,svxz2 132 133 # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5 134 vmovd y3,ruwy3x 135 vmovd w3,t1x 136 vpunpcklqdq t1,ruwy3,ruwy3 137 vmovd u3,t1x 138 vmovd r3,t2x 139 vpunpcklqdq t2,t1,t1 140 vperm2i128 $0x20,t1,ruwy3,ruwy3 141 vpslld $2,ruwy3,svxz3 142 vpaddd ruwy3,svxz3,svxz3 143 144 # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5 145 vmovd y4,ruwy4x 146 vmovd w4,t1x 147 vpunpcklqdq t1,ruwy4,ruwy4 148 vmovd u4,t1x 149 vmovd r4,t2x 150 vpunpcklqdq t2,t1,t1 151 vperm2i128 $0x20,t1,ruwy4,ruwy4 152 vpslld $2,ruwy4,svxz4 153 vpaddd ruwy4,svxz4,svxz4 154 155.Ldoblock4: 156 # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff, 157 # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0] 158 vmovd 0x00(m),hc0x 159 vmovd 0x10(m),t1x 160 vpunpcklqdq t1,hc0,hc0 161 vmovd 0x20(m),t1x 162 vmovd 0x30(m),t2x 163 vpunpcklqdq t2,t1,t1 164 vperm2i128 $0x20,t1,hc0,hc0 165 vpand ANMASK(%rip),hc0,hc0 166 vmovd h0,t1x 167 vpaddd t1,hc0,hc0 168 # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff, 169 # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1] 170 vmovd 0x03(m),hc1x 171 vmovd 0x13(m),t1x 172 vpunpcklqdq t1,hc1,hc1 173 vmovd 0x23(m),t1x 174 vmovd 0x33(m),t2x 175 vpunpcklqdq t2,t1,t1 176 vperm2i128 $0x20,t1,hc1,hc1 177 vpsrld $2,hc1,hc1 178 vpand ANMASK(%rip),hc1,hc1 179 vmovd h1,t1x 180 vpaddd t1,hc1,hc1 181 # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff, 182 # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2] 183 vmovd 0x06(m),hc2x 184 vmovd 0x16(m),t1x 185 vpunpcklqdq t1,hc2,hc2 186 vmovd 0x26(m),t1x 187 vmovd 0x36(m),t2x 188 vpunpcklqdq t2,t1,t1 189 vperm2i128 $0x20,t1,hc2,hc2 190 vpsrld $4,hc2,hc2 191 vpand ANMASK(%rip),hc2,hc2 192 vmovd h2,t1x 193 vpaddd t1,hc2,hc2 194 # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff, 195 # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3] 196 vmovd 0x09(m),hc3x 197 vmovd 0x19(m),t1x 198 vpunpcklqdq t1,hc3,hc3 199 vmovd 0x29(m),t1x 200 vmovd 0x39(m),t2x 201 vpunpcklqdq t2,t1,t1 202 vperm2i128 $0x20,t1,hc3,hc3 203 vpsrld $6,hc3,hc3 204 vpand ANMASK(%rip),hc3,hc3 205 vmovd h3,t1x 206 vpaddd t1,hc3,hc3 207 # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24), 208 # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4] 209 vmovd 0x0c(m),hc4x 210 vmovd 0x1c(m),t1x 211 vpunpcklqdq t1,hc4,hc4 212 vmovd 0x2c(m),t1x 213 vmovd 0x3c(m),t2x 214 vpunpcklqdq t2,t1,t1 215 vperm2i128 $0x20,t1,hc4,hc4 216 vpsrld $8,hc4,hc4 217 vpor ORMASK(%rip),hc4,hc4 218 vmovd h4,t1x 219 vpaddd t1,hc4,hc4 220 221 # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ] 222 vpmuludq hc0,ruwy0,t1 223 # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ] 224 vpmuludq hc1,svxz4,t2 225 vpaddq t2,t1,t1 226 # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ] 227 vpmuludq hc2,svxz3,t2 228 vpaddq t2,t1,t1 229 # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ] 230 vpmuludq hc3,svxz2,t2 231 vpaddq t2,t1,t1 232 # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ] 233 vpmuludq hc4,svxz1,t2 234 vpaddq t2,t1,t1 235 # d0 = t1[0] + t1[1] + t[2] + t[3] 236 vpermq $0xee,t1,t2 237 vpaddq t2,t1,t1 238 vpsrldq $8,t1,t2 239 vpaddq t2,t1,t1 240 vmovq t1x,d0 241 242 # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ] 243 vpmuludq hc0,ruwy1,t1 244 # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ] 245 vpmuludq hc1,ruwy0,t2 246 vpaddq t2,t1,t1 247 # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ] 248 vpmuludq hc2,svxz4,t2 249 vpaddq t2,t1,t1 250 # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ] 251 vpmuludq hc3,svxz3,t2 252 vpaddq t2,t1,t1 253 # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ] 254 vpmuludq hc4,svxz2,t2 255 vpaddq t2,t1,t1 256 # d1 = t1[0] + t1[1] + t1[3] + t1[4] 257 vpermq $0xee,t1,t2 258 vpaddq t2,t1,t1 259 vpsrldq $8,t1,t2 260 vpaddq t2,t1,t1 261 vmovq t1x,d1 262 263 # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ] 264 vpmuludq hc0,ruwy2,t1 265 # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ] 266 vpmuludq hc1,ruwy1,t2 267 vpaddq t2,t1,t1 268 # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ] 269 vpmuludq hc2,ruwy0,t2 270 vpaddq t2,t1,t1 271 # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ] 272 vpmuludq hc3,svxz4,t2 273 vpaddq t2,t1,t1 274 # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ] 275 vpmuludq hc4,svxz3,t2 276 vpaddq t2,t1,t1 277 # d2 = t1[0] + t1[1] + t1[2] + t1[3] 278 vpermq $0xee,t1,t2 279 vpaddq t2,t1,t1 280 vpsrldq $8,t1,t2 281 vpaddq t2,t1,t1 282 vmovq t1x,d2 283 284 # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ] 285 vpmuludq hc0,ruwy3,t1 286 # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ] 287 vpmuludq hc1,ruwy2,t2 288 vpaddq t2,t1,t1 289 # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ] 290 vpmuludq hc2,ruwy1,t2 291 vpaddq t2,t1,t1 292 # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ] 293 vpmuludq hc3,ruwy0,t2 294 vpaddq t2,t1,t1 295 # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ] 296 vpmuludq hc4,svxz4,t2 297 vpaddq t2,t1,t1 298 # d3 = t1[0] + t1[1] + t1[2] + t1[3] 299 vpermq $0xee,t1,t2 300 vpaddq t2,t1,t1 301 vpsrldq $8,t1,t2 302 vpaddq t2,t1,t1 303 vmovq t1x,d3 304 305 # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ] 306 vpmuludq hc0,ruwy4,t1 307 # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ] 308 vpmuludq hc1,ruwy3,t2 309 vpaddq t2,t1,t1 310 # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ] 311 vpmuludq hc2,ruwy2,t2 312 vpaddq t2,t1,t1 313 # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ] 314 vpmuludq hc3,ruwy1,t2 315 vpaddq t2,t1,t1 316 # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ] 317 vpmuludq hc4,ruwy0,t2 318 vpaddq t2,t1,t1 319 # d4 = t1[0] + t1[1] + t1[2] + t1[3] 320 vpermq $0xee,t1,t2 321 vpaddq t2,t1,t1 322 vpsrldq $8,t1,t2 323 vpaddq t2,t1,t1 324 vmovq t1x,d4 325 326 # d1 += d0 >> 26 327 mov d0,%rax 328 shr $26,%rax 329 add %rax,d1 330 # h0 = d0 & 0x3ffffff 331 mov d0,%rbx 332 and $0x3ffffff,%ebx 333 334 # d2 += d1 >> 26 335 mov d1,%rax 336 shr $26,%rax 337 add %rax,d2 338 # h1 = d1 & 0x3ffffff 339 mov d1,%rax 340 and $0x3ffffff,%eax 341 mov %eax,h1 342 343 # d3 += d2 >> 26 344 mov d2,%rax 345 shr $26,%rax 346 add %rax,d3 347 # h2 = d2 & 0x3ffffff 348 mov d2,%rax 349 and $0x3ffffff,%eax 350 mov %eax,h2 351 352 # d4 += d3 >> 26 353 mov d3,%rax 354 shr $26,%rax 355 add %rax,d4 356 # h3 = d3 & 0x3ffffff 357 mov d3,%rax 358 and $0x3ffffff,%eax 359 mov %eax,h3 360 361 # h0 += (d4 >> 26) * 5 362 mov d4,%rax 363 shr $26,%rax 364 lea (%eax,%eax,4),%eax 365 add %eax,%ebx 366 # h4 = d4 & 0x3ffffff 367 mov d4,%rax 368 and $0x3ffffff,%eax 369 mov %eax,h4 370 371 # h1 += h0 >> 26 372 mov %ebx,%eax 373 shr $26,%eax 374 add %eax,h1 375 # h0 = h0 & 0x3ffffff 376 andl $0x3ffffff,%ebx 377 mov %ebx,h0 378 379 add $0x40,m 380 dec %rcx 381 jnz .Ldoblock4 382 383 vzeroupper 384 pop %r13 385 pop %r12 386 pop %rbx 387 ret 388ENDPROC(poly1305_4block_avx2) 389