/* Copyright (c) 2013, 2018, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
/* See strncmp-stub.c */
#else

/*-----------------------------------------------------------------------
 * int strncmp (const char *s1, const char *s2, size_t n)
 *
 * Compare at most n bytes of two NUL-terminated strings.
 *
 * ABI:      AAPCS64.
 * In:       x0 = s1, x1 = s2, x2 = n.
 * Out:      x0 = result (<0, 0, >0; computed as an unsigned byte
 *           difference at the first mismatch, or 0 if equal within n).
 * Clobbers: x3-x15, NZCV flags.  No stack usage; leaf function.
 *
 * Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible: reads never cross a 16-byte granule boundary beyond
 * the data actually being compared.
 */

#include "asmdefs.h"

/* All-ones-per-byte patterns used by the parallel NUL-byte test below.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result. */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables. */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define mask		x13
#define endloop		x14
#define count		mask
#define offset		pos
#define neg_offset	x15

/* Define endian dependent shift operations.
   On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
 */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif

ENTRY (strncmp)
	/* PTR_ARG/SIZE_ARG come from asmdefs.h; presumably they normalize
	   pointer/size arguments for ILP32 variants -- confirm there.  */
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	cbz	limit, L(ret0)		/* n == 0: strings compare equal.  */
	eor	tmp1, src1, src2	/* Low 3 bits equal => mutually aligned.  */
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	and	count, src1, #7		/* count = bytes past 8-byte alignment.  */
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word. */
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8	/* C set (hi possible) while limit not exhausted.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found. */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences. */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator. */
	ccmp	endloop, #0, #0, eq	/* Z only if both has_nul and endloop are 0.  */
	b.eq	L(loop_aligned)
	/* End of main loop */

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs. */
L(syndrome_check):
	/* Limit was reached.  Check if the NUL byte or the difference
	   is before the limit. */
	rev	syndrome, syndrome	/* Little-endian: byte-reverse so clz finds
					   the FIRST differing/NUL byte.  */
	rev	data1, data1
	clz	pos, syndrome		/* pos = bit offset of first difference/NUL.  */
	rev	data2, data2
	lsl	data1, data1, pos	/* Move the critical byte to the top.  */
	cmp	limit, pos, lsr #3	/* Is that byte within the n limit?  */
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction. */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi	/* Beyond the limit: report equal.  */
	ret
#else
	/* Not reached the limit, must have found the end or a diff. */
	tbz	limit, #63, L(not_limit)	/* limit went negative => limit hit.  */
	add	tmp1, limit, 8
	cbz	limit, L(not_limit)

	lsl	limit, tmp1, #3	/* Bits -> bytes. */
	mov	mask, #~0
	lsr	mask, mask, limit	/* Mask covering bytes past the limit.  */
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome. */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01. */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant. */
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne		/* 1 if different at all...  */
	cneg	result, result, lo	/* ...negated when data1 < data2.  */
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value. */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits. */
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction. */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX. */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63). */
	/* Adjust the limit and ensure it doesn't overflow. */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo	/* Saturate to all-ones on overflow.  */
	orr	data1, data1, tmp2	/* Force pre-start bytes non-zero/equal.  */
	orr	data2, data2, tmp2
	b	L(start_realigned)

	.p2align 4
	/* Don't bother with dwords for up to 16 bytes. */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this. */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  Stop on NUL or limit.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  Stop on mismatch.  */
	b.eq	L(byte_loop)
L(done):
	sub	result, data1, data2	/* Bytes already zero-extended by ldrb.  */
	ret
	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop. */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7	/* Bytes needed to align src1 up.  */
	sub	limit, limit, count	/* Safe: limit >= 16 > count here.  */

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1		/* C clear iff data1w == 0 (NUL).  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000. */
	b.ne	L(done)			/* NUL or mismatch found.  */
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order.  For little-endian, it is
	   reversed in the registers.  The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.
	   src1 | a a a a a a a a | b b b c c c c c | . . .
	   src2 | x x x x x a a a   a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
	                STEP_A              STEP_B              STEP_C
	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes.  This way we can read 16 bytes at a
	   time from SRC2.  The comparison happens in 3 steps.  After each step
	   the loop can exit, or read from SRC1 or SRC2. */
L(src1_aligned):
	/* Calculate offset from 8 byte alignment to string start in bits.  No
	   need to mask offset since shifts are ignoring upper bits. */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf	/* 16-byte align src2 for ldp.  */
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */
	/* Skip the first compare if data in tmp1 is irrelevant. */
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.*/
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found. */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2. */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid carry-propagation
	   problem described above.  This way we can reuse the has_nul in the
	   next step and also use syndrome value trick at the end. */
	rev	tmp3, data1
	#define data1_fixed tmp3
#else
	#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found. */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator. */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes. */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1. */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found. */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3	/* Is the first hit within the limit?  */
	b.lo	L(end_quick)
	/* Otherwise fall through: limit reached with no earlier hit.  */
#endif

L(ret0):
	mov	result, #0
	ret
END(strncmp)
#endif