/*
 * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using the hardware-provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is an instruction introduced with Intel SSE4.2; the reference can
 * be found at:
 * http://www.intel.com/products/processor/manuals/
 *  Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 *  Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:	Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *		Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.section	".rodata", "a"
	.align		6
	.cpu		generic+crypto+crc

.Lcrc32_constants:
	/*
	 * [(x^(4*128+32) mod P(x)) << 32]' << 1	= 0x154442bd4
	 * #define CONSTANT_R1	0x154442bd4LL
	 *
	 * [(x^(4*128-32) mod P(x)) << 32]' << 1	= 0x1c6e41596
	 * #define CONSTANT_R2	0x1c6e41596LL
	 */
	.octa		0x00000001c6e415960000000154442bd4

	/*
	 * [(x^(128+32) mod P(x)) << 32]' << 1		= 0x1751997d0
	 * #define CONSTANT_R3	0x1751997d0LL
	 *
	 * [(x^(128-32) mod P(x)) << 32]' << 1		= 0x0ccaa009e
	 * #define CONSTANT_R4	0x0ccaa009eLL
	 */
	.octa		0x00000000ccaa009e00000001751997d0

	/*
	 * [(x^64 mod P(x)) << 32]' << 1		= 0x163cd6124
	 * #define CONSTANT_R5	0x163cd6124LL
	 */
	.quad		0x0000000163cd6124
	.quad		0x00000000FFFFFFFF

	/*
	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
	 *
	 * Barrett reduction constant (u64') = u' = (x^64 / P(x))' = 0x1F7011641LL
	 * #define CONSTANT_RU	0x1F7011641LL
	 */
	.octa		0x00000001F701164100000001DB710641

.Lcrc32c_constants:
	/* the same set of constants, derived for the CRC-32C (Castagnoli) polynomial */
	.octa		0x000000009e4addf800000000740eef02
	.octa		0x000000014cd00bd600000000f20c0dfe
	.quad		0x00000000dd45aab8
	.quad		0x00000000FFFFFFFF
	.octa		0x00000000dea713f10000000105ec76f0
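	/*
	 * The fold constants above are of the standard form used for
	 * carryless-multiplication CRC folding (see e.g. Intel's white paper
	 * on fast CRC computation using PCLMULQDQ): reduce x^N mod P(x),
	 * bit-reflect the 32-bit remainder and shift it left by one.  As an
	 * illustration only (not part of the kernel build; the helper names
	 * below are made up), CONSTANT_R1 could be recomputed with plain C
	 * along these lines:
	 *
	 *	#include <stdint.h>
	 *
	 *	// x^n mod P(x) over GF(2), with P(x) the 33-bit CRC-32
	 *	// polynomial 0x104c11db7
	 *	static uint32_t xn_mod_p(unsigned int n)
	 *	{
	 *		uint64_t r = 1;
	 *
	 *		while (n--) {
	 *			r <<= 1;
	 *			if (r & (1ULL << 32))
	 *				r ^= 0x104c11db7ULL;
	 *		}
	 *		return (uint32_t)r;
	 *	}
	 *
	 *	// reverse the bit order of a 32-bit value
	 *	static uint32_t reflect32(uint32_t v)
	 *	{
	 *		uint32_t r = 0;
	 *		int i;
	 *
	 *		for (i = 0; i < 32; i++)
	 *			r |= ((v >> i) & 1U) << (31 - i);
	 *		return r;
	 *	}
	 *
	 *	// [(x^(4*128+32) mod P(x)) << 32]' << 1, i.e. CONSTANT_R1,
	 *	// which should come out as 0x154442bd4
	 *	uint64_t r1 = (uint64_t)reflect32(xn_mod_p(4 * 128 + 32)) << 1;
	 */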
	vCONSTANT	.req	v0
	dCONSTANT	.req	d0
	qCONSTANT	.req	q0

	BUF		.req	x19
	LEN		.req	x20
	CRC		.req	x21
	CONST		.req	x22

	vzr		.req	v9

	/**
	 * Calculate crc32
	 * BUF - buffer
	 * LEN - size of buffer (multiple of 16 bytes), LEN should be > 63
	 * CRC - initial crc32
	 * return: the resulting crc32, in w0
	 * uint crc32_pmull_le(unsigned char const *buffer,
	 *                     size_t len, uint crc32)
	 */
	.text
ENTRY(crc32_pmull_le)
	adr_l		x3, .Lcrc32_constants
	b		0f

ENTRY(crc32c_pmull_le)
	adr_l		x3, .Lcrc32c_constants

0:	frame_push	4, 64

	mov		BUF, x0
	mov		LEN, x1
	mov		CRC, x2
	mov		CONST, x3

	bic		LEN, LEN, #15
	ld1		{v1.16b-v4.16b}, [BUF], #0x40
	movi		vzr.16b, #0
	fmov		dCONSTANT, CRC
	eor		v1.16b, v1.16b, vCONSTANT.16b
	sub		LEN, LEN, #0x40
	cmp		LEN, #0x40
	b.lt		less_64

	ldr		qCONSTANT, [CONST]

loop_64:		/* 64-byte full cache line folding */
	sub		LEN, LEN, #0x40

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull2		v6.1q, v2.2d, vCONSTANT.2d
	pmull2		v7.1q, v3.2d, vCONSTANT.2d
	pmull2		v8.1q, v4.2d, vCONSTANT.2d

	pmull		v1.1q, v1.1d, vCONSTANT.1d
	pmull		v2.1q, v2.1d, vCONSTANT.1d
	pmull		v3.1q, v3.1d, vCONSTANT.1d
	pmull		v4.1q, v4.1d, vCONSTANT.1d

	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [BUF], #0x10
	eor		v2.16b, v2.16b, v6.16b
	ld1		{v6.16b}, [BUF], #0x10
	eor		v3.16b, v3.16b, v7.16b
	ld1		{v7.16b}, [BUF], #0x10
	eor		v4.16b, v4.16b, v8.16b
	ld1		{v8.16b}, [BUF], #0x10

	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v8.16b

	cmp		LEN, #0x40
	b.lt		less_64

	if_will_cond_yield_neon
	stp		q1, q2, [sp, #.Lframe_local_offset]
	stp		q3, q4, [sp, #.Lframe_local_offset + 32]
	do_cond_yield_neon
	ldp		q1, q2, [sp, #.Lframe_local_offset]
	ldp		q3, q4, [sp, #.Lframe_local_offset + 32]
	ldr		qCONSTANT, [CONST]
	movi		vzr.16b, #0
	endif_yield_neon
	b		loop_64

less_64:		/* fold the cache line into 128 bits */
	ldr		qCONSTANT, [CONST, #16]

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v2.16b

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v3.16b

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v4.16b

	cbz		LEN, fold_64

loop_16:		/* fold the rest of the buffer into 128 bits */
	subs		LEN, LEN, #0x10

	ld1		{v2.16b}, [BUF], #0x10
	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v2.16b

	b.ne		loop_16

fold_64:
	/*
	 * Perform the last 64-bit fold; this also appends 32 zero bits to
	 * the input stream.
	 */
	ext		v2.16b, v1.16b, v1.16b, #8
	pmull2		v2.1q, v2.2d, vCONSTANT.2d
	ext		v1.16b, v1.16b, vzr.16b, #8
	eor		v1.16b, v1.16b, v2.16b

	/* final 32-bit fold */
	ldr		dCONSTANT, [CONST, #32]
	ldr		d3, [CONST, #40]

	ext		v2.16b, v1.16b, vzr.16b, #4
	and		v1.16b, v1.16b, v3.16b
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v2.16b

	/* finish up with the bit-reversed Barrett reduction, 64 -> 32 bits */
	ldr		qCONSTANT, [CONST, #48]

	and		v2.16b, v1.16b, v3.16b
	ext		v2.16b, vzr.16b, v2.16b, #8
	pmull2		v2.1q, v2.2d, vCONSTANT.2d
	and		v2.16b, v2.16b, v3.16b
	pmull		v2.1q, v2.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v2.16b
	mov		w0, v1.s[1]

	frame_pop
	ret
ENDPROC(crc32_pmull_le)
ENDPROC(crc32c_pmull_le)
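	/*
	 * The __crc32 macro below generates the implementation built purely
	 * on the ARMv8 CRC32 instructions.  For little-endian input it is
	 * roughly equivalent to the following C, using the ACLE intrinsics
	 * from <arm_acle.h> (a sketch only, with a hypothetical function
	 * name and assuming -march=armv8-a+crc; the assembly additionally
	 * byte-swaps the loads on big-endian kernels via CPU_BE()):
	 *
	 *	#include <arm_acle.h>
	 *	#include <stdint.h>
	 *	#include <string.h>
	 *
	 *	static uint32_t crc32_scalar(uint32_t crc, const uint8_t *p,
	 *				     size_t len)
	 *	{
	 *		uint64_t d;
	 *		uint32_t w;
	 *		uint16_t h;
	 *
	 *		while (len >= 16) {		// main loop at label 0:
	 *			memcpy(&d, p, 8);
	 *			crc = __crc32d(crc, d);
	 *			memcpy(&d, p + 8, 8);
	 *			crc = __crc32d(crc, d);
	 *			p += 16;
	 *			len -= 16;
	 *		}
	 *		if (len & 8) {			// tail, label 8:
	 *			memcpy(&d, p, 8);
	 *			crc = __crc32d(crc, d);
	 *			p += 8;
	 *		}
	 *		if (len & 4) {			// label 4:
	 *			memcpy(&w, p, 4);
	 *			crc = __crc32w(crc, w);
	 *			p += 4;
	 *		}
	 *		if (len & 2) {			// label 2:
	 *			memcpy(&h, p, 2);
	 *			crc = __crc32h(crc, h);
	 *			p += 2;
	 *		}
	 *		if (len & 1)			// label 1:
	 *			crc = __crc32b(crc, *p);
	 *		return crc;
	 *	}
	 *
	 * The crc32c_* variant is identical except that it uses the crc32c*
	 * instructions (__crc32c* intrinsics).
	 */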
	.macro		__crc32, c
0:	subs		x2, x2, #16
	b.mi		8f
	ldp		x3, x4, [x1], #16
CPU_BE(	rev		x3, x3		)
CPU_BE(	rev		x4, x4		)
	crc32\c\()x	w0, w0, x3
	crc32\c\()x	w0, w0, x4
	b.ne		0b
	ret

8:	tbz		x2, #3, 4f
	ldr		x3, [x1], #8
CPU_BE(	rev		x3, x3		)
	crc32\c\()x	w0, w0, x3
4:	tbz		x2, #2, 2f
	ldr		w3, [x1], #4
CPU_BE(	rev		w3, w3		)
	crc32\c\()w	w0, w0, w3
2:	tbz		x2, #1, 1f
	ldrh		w3, [x1], #2
CPU_BE(	rev16		w3, w3		)
	crc32\c\()h	w0, w0, w3
1:	tbz		x2, #0, 0f
	ldrb		w3, [x1]
	crc32\c\()b	w0, w0, w3
0:	ret
	.endm

	.align		5
ENTRY(crc32_armv8_le)
	__crc32
ENDPROC(crc32_armv8_le)

	.align		5
ENTRY(crc32c_armv8_le)
	__crc32		c
ENDPROC(crc32c_armv8_le)
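/*
 * Calling-convention summary for the entry points in this file, inferred from
 * the register usage above (the exact prototypes used by the C glue code may
 * differ and are not part of this file):
 *
 *	// PMULL-based bulk routines: x0 = buf, x1 = len, x2 = initial CRC,
 *	// result returned in w0
 *	u32 crc32_pmull_le(const u8 *buf, u64 len, u32 crc);
 *	u32 crc32c_pmull_le(const u8 *buf, u64 len, u32 crc);
 *
 *	// CRC32-instruction based routines: w0 = initial CRC, x1 = buf,
 *	// x2 = len, result returned in w0
 *	u32 crc32_armv8_le(u32 crc, const u8 *buf, size_t len);
 *	u32 crc32c_armv8_le(u32 crc, const u8 *buf, size_t len);
 */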