/*
 * Copyright (c) 2012-2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (eg because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */

/* This version uses Thumb-2 code.  */
#include <picolibc.h>

	.thumb
	.syntax unified

#include "arm_asm.h"

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2

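/* For reference, a rough C model of the per-word test used throughout
   this file (illustrative only; strcmp_syndrome is a hypothetical helper
   name and bytes are numbered little-endian).  "uadd8 x, w, const_m1"
   sets APSR.GE[i] exactly when byte i of w is non-zero (the byte-wise
   add carries out), and "sel" then builds a syndrome that is 0xff for
   every NUL byte of data1 and (data1 ^ data2) everywhere else:

	#include <stdint.h>

	static uint32_t
	strcmp_syndrome (uint32_t data1, uint32_t data2)
	{
	  uint32_t syndrome = 0;
	  for (int i = 0; i < 4; i++)
	    {
	      uint8_t b1 = (data1 >> (8 * i)) & 0xff;
	      uint8_t b2 = (data2 >> (8 * i)) & 0xff;
	      syndrome |= (uint32_t) (b1 ? (b1 ^ b2) : 0xff) << (8 * i);
	    }
	  return syndrome;
	}

   A zero syndrome means "all four byte pairs are equal and data1 has no
   NUL"; a non-zero syndrome locates either the first mismatch or the end
   of the string, which the epilogue below turns into the return value.  */
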
	/* Macro to compute and return the result value for word-aligned
	   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF
#else
	/* To use the big-endian trick we'd have to reverse all three words.
	   That's slower than this approach.  */
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1

	epilogue push_ip=HAVE_PAC_LEAF
#endif
	.endm

	.text
	.p2align	5
def_fn	strcmp
	.fnstart
	.cfi_sections .debug_frame
	.cfi_startproc
	prologue push_ip=HAVE_PAC_LEAF
#ifndef STRCMP_NO_PRECHECK
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	.Lfastpath_exit
#endif
	strd	r4, r5, [sp, #-16]!
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_rel_offset 6, 8
	.cfi_rel_offset 7, 12
	mvn	const_m1, #0
	lsl	r2, tmp1, #29
	cbz	r2, .Lloop_aligned8

.Lnot_aligned:
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	.Lmisaligned8

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	.Lstart_realigned8
	orn	data1b, data1b, tmp1
	mov	data1a, const_m1
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	.Lstart_realigned8
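
	/* Worked example of the masking above, assuming a little-endian
	   build (the values are purely illustrative): if src1 = 0x1003,
	   it is aligned down to 0x1000, tmp2 becomes 24 (bits), and
	   S2HI gives tmp1 = 0xffffffff << 24 = 0xff000000.  Each ORN
	   then ORs ~tmp1 = 0x00ffffff into the first loaded word, so the
	   three bytes that precede the real start of both strings are
	   forced to 0xff: they compare equal and cannot be mistaken for
	   a terminating NUL.  If the offset within the doubleword is 4
	   or more, the whole first word is instead replaced with
	   0xffffffff and the mask is applied to the second word.  */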

	/* Unwind the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12	/* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
.Lloop_aligned8:
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
.Lstart_realigned8:
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, .Ldiff_in_a
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, .Ldiff_in_b

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
	beq	.Lloop_aligned8

.Ldiff_found:
	cbnz	syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	.cfi_restore_state
.Lmisaligned8:
	tst	tmp1, #3
	bne	.Lmisaligned4
	ands	tmp1, src1, #3
	bne	.Lmutual_align4

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
.Lloop_aligned4:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned4:
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, .Laligned4_done
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	.Lloop_aligned4

.Laligned4_done:
	strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	.Lstart_realigned4

.Lmisaligned4:
	ands	tmp1, src1, #3
	beq	.Lsrc1_aligned
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	.Laligned_m2
	bcs	.Laligned_m1

#ifdef STRCMP_NO_PRECHECK
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m1:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	add	src2, src2, #4
	cbnz	data2, .Lsrc1_aligned
#else  /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbnz	data2, .Laligned_m1
#endif

.Lmisaligned_exit:
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	.cfi_restore 4
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF

#ifndef STRCMP_NO_PRECHECK
.Lfastpath_exit:
	.cfi_restore_state
	.cfi_remember_state
	sub	r0, r2, r3
	epilogue push_ip=HAVE_PAC_LEAF

.Laligned_m1:
	.cfi_restore_state
	.cfi_remember_state
	add	src2, src2, #4
#endif
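
	/* Byte layout for the overlap cases below, assuming little-endian
	   byte order and using src2 & 3 == 1 (.Loverlap3) as the
	   illustration:

		word loaded from src1:     [ a0 a1 a2 a3 ]
		aligned words from src2:   [ -- b0 b1 b2 ] [ b3 b4 b5 b6 ]

	   src1 is word aligned but src2 is not, so each word read from
	   src1 straddles two aligned words read from src2.  The loops
	   below therefore compare a0..a2 against b0..b2 (data2 shifted
	   down by one byte) and then a3 against b3 from the next load,
	   advancing both pointers one word per iteration.  .Loverlap2
	   and .Loverlap1 handle offsets of two and three bytes in the
	   same way.  */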
.Lsrc1_aligned:
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap3
4:
	S2LO	data2, data2, #8
	b	.Lstrcmp_tail

5:
	bics	syndrome, syndrome, #MSB
	bne	.Lstrcmp_done_equal

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 Not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	neg	result, result
	epilogue push_ip=HAVE_PAC_LEAF

6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap2
4:
	S2LO	data2, data2, #16
	b	.Lstrcmp_tail
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	.Lstrcmp_done_equal

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	.Lstrcmp_tail

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap1
4:
	S2LO	data2, data2, #24
	b	.Lstrcmp_tail
5:
	tst	syndrome, #LSB
	bne	.Lstrcmp_done_equal
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	.Lstrcmp_tail

.Lstrcmp_done_equal:
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF

.Lstrcmp_tail:
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	sub	result, result, data2, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF
	.cfi_endproc
	.cantunwind
	.fnend
	.size	strcmp, . - strcmp