/*
 * Copyright (c) 2012-2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

	/* Implementation of strcmp for ARMv7 when DSP instructions are
	   available.  Use ldrd to support wider loads, provided the data
	   is sufficiently aligned.  Use saturating arithmetic to optimize
	   the compares.  */

	/* Build Options:
	   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
	   byte in the string.  If comparing completely random strings
	   the pre-check will save time, since there is a very high
	   probability of a mismatch in the first character: we save
	   significant overhead if this is the common case.  However,
	   if strings are likely to be identical (e.g. because we're
	   verifying a hit in a hash table), then this check is largely
	   redundant.  */

	/* This version uses Thumb-2 code.  */
	.thumb
	.syntax unified

#include "arm_asm.h"

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2


	/* Macro to compute and return the result value for word-aligned
	   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
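	/* Worked example (illustrative only): with \synd == 0x00008000 the
	   CLZ below returns 16, so the byte pair that produced the syndrome
	   bit sits in bits 15:8 of \d1 and \d2; shifting both left by 16
	   moves it to bits 31:24, LSR #24 isolates each byte, and the final
	   subtraction gives the usual negative/zero/positive result.  */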
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF
#else
	/* To use the big-endian trick we'd have to reverse all three words.
	   That's slower than this approach.  */
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1

	epilogue push_ip=HAVE_PAC_LEAF
#endif
	.endm

	.text
	.p2align	5
def_fn	strcmp
	.fnstart
	.cfi_sections .debug_frame
	.cfi_startproc
	prologue push_ip=HAVE_PAC_LEAF
#ifndef STRCMP_NO_PRECHECK
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	.Lfastpath_exit
#endif
	strd	r4, r5, [sp, #-16]!
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_rel_offset 6, 8
	.cfi_rel_offset 7, 12
	mvn	const_m1, #0
	lsl	r2, tmp1, #29
	cbz	r2, .Lloop_aligned8

.Lnot_aligned:
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	.Lmisaligned8

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	.Lstart_realigned8
	orn	data1b, data1b, tmp1
	mov	data1a, const_m1
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	.Lstart_realigned8

	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12	/* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
.Lloop_aligned8:
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
.Lstart_realigned8:
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, .Ldiff_in_a
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, .Ldiff_in_b

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
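	/* The UADD8/EOR/SEL sequences above compute the "syndrome" described
	   in the header comment.  A rough C sketch of one word's worth of
	   that test (illustrative only, not part of the build; assumes
	   <stdint.h>):

	       uint32_t syndrome (uint32_t d1, uint32_t d2)
	       {
	         uint32_t syn = 0;
	         for (int i = 0; i < 4; i++)
	           {
	             uint32_t b1 = (d1 >> (8 * i)) & 0xff;
	             uint32_t b2 = (d2 >> (8 * i)) & 0xff;
	             // UADD8 d1 + 0xff sets GE[i] unless byte i of d1 is 0;
	             // SEL then keeps the XOR difference for live bytes and
	             // forces 0xff where d1 holds a NUL.
	             syn |= (b1 == 0 ? 0xffu : (b1 ^ b2)) << (8 * i);
	           }
	         return syn;	// Non-zero on any difference or at end of string.
	       }
	   */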
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
	beq	.Lloop_aligned8

.Ldiff_found:
	cbnz	syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	.cfi_restore_state
.Lmisaligned8:
	tst	tmp1, #3
	bne	.Lmisaligned4
	ands	tmp1, src1, #3
	bne	.Lmutual_align4

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
.Lloop_aligned4:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned4:
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, .Laligned4_done
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	.Lloop_aligned4

.Laligned4_done:
	strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	.Lstart_realigned4

.Lmisaligned4:
	ands	tmp1, src1, #3
	beq	.Lsrc1_aligned
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	.Laligned_m2
	bcs	.Laligned_m1

#ifdef STRCMP_NO_PRECHECK
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m1:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	add	src2, src2, #4
	cbnz	data2, .Lsrc1_aligned
#else  /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbnz	data2, .Laligned_m1
#endif

.Lmisaligned_exit:
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	.cfi_restore 4
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF

#ifndef STRCMP_NO_PRECHECK
.Lfastpath_exit:
	.cfi_restore_state
	.cfi_remember_state
	sub	r0, r2, r3
	epilogue push_ip=HAVE_PAC_LEAF

.Laligned_m1:
	.cfi_restore_state
	.cfi_remember_state
	add	src2, src2, #4
#endif
.Lsrc1_aligned:
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */
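	/* Illustrative note: LSLS #31 moves bit 1 of src2 into C and sets Z
	   only when bit 0 is clear, so BHI, BCS and the fall-through below
	   select the 0b11, 0b10 and 0b01 offsets of src2 within its word.
	   Each .LoverlapN path then consumes the N useful bytes of every
	   newly loaded (rounded-down) src2 word.  */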
	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	.Loverlap1	/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	.Loverlap2	/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap3
4:
	S2LO	data2, data2, #8
	b	.Lstrcmp_tail

5:
	bics	syndrome, syndrome, #MSB
	bne	.Lstrcmp_done_equal

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	neg	result, result
	epilogue push_ip=HAVE_PAC_LEAF

6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap2
4:
	S2LO	data2, data2, #16
	b	.Lstrcmp_tail
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	.Lstrcmp_done_equal

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	.Lstrcmp_tail

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap1
4:
	S2LO	data2, data2, #24
	b	.Lstrcmp_tail
5:
	tst	syndrome, #LSB
	bne	.Lstrcmp_done_equal
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	.Lstrcmp_tail

.Lstrcmp_done_equal:
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF

.Lstrcmp_tail:
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	sub	result, result, data2, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF
	.cfi_endproc
	.cantunwind
	.fnend
	.size strcmp, . - strcmp