/* Copyright (c) 2010-2011, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   Written by Dave Gilbert <david.gilbert@linaro.org>

   This memchr routine is optimised on a Cortex-A9 and should work on
   all ARMv7 processors.  It has a fast path for short sizes, and has
   an optimised path for large data sets; the worst case is finding the
   match early in a large data set.  */

/* Copyright (c) 2015 ARM Ltd.
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */

@ Change history:
@ 2011-02-07 david.gilbert@linaro.org
@    Extracted from local git a5b438d861
@ 2011-07-14 david.gilbert@linaro.org
@    Import endianness fix from local git ea786f1b
@ 2011-10-11 david.gilbert@linaro.org
@    Import from cortex-strings bzr rev 63
@    Flip to ldrd (as suggested by Greta Yorsh)
@    Make conditional on CPU type
@    tidy

@ This code requires armv6t2 or later.  Uses Thumb2.
75 76 .syntax unified 77 78#include "arm_asm.h" 79 80@ NOTE: This ifdef MUST match the one in memchr-stub.c 81#if defined (__ARM_NEON__) || defined (__ARM_NEON) 82#if __ARM_ARCH >= 8 && __ARM_ARCH_PROFILE == 'R' 83 .arch armv8-r 84#else 85 .arch armv7-a 86#endif 87 .fpu neon 88 89 90/* Arguments */ 91#define srcin r0 92#define chrin r1 93#define cntin r2 94 95/* Retval */ 96#define result r0 /* Live range does not overlap with srcin */ 97 98/* Working registers */ 99#define src r1 /* Live range does not overlap with chrin */ 100#define tmp r3 101#define synd r0 /* No overlap with srcin or result */ 102#define soff r12 103 104/* Working NEON registers */ 105#define vrepchr q0 106#define vdata0 q1 107#define vdata0_0 d2 /* Lower half of vdata0 */ 108#define vdata0_1 d3 /* Upper half of vdata0 */ 109#define vdata1 q2 110#define vdata1_0 d4 /* Lower half of vhas_chr0 */ 111#define vdata1_1 d5 /* Upper half of vhas_chr0 */ 112#define vrepmask q3 113#define vrepmask0 d6 114#define vrepmask1 d7 115#define vend q4 116#define vend0 d8 117#define vend1 d9 118 119/* 120 * Core algorithm: 121 * 122 * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per 123 * byte. Each bit is set if the relevant byte matched the requested character 124 * and cleared otherwise. Since the bits in the syndrome reflect exactly the 125 * order in which things occur in the original string, counting trailing zeros 126 * allows to identify exactly which byte has matched. 127 */ 128 129 .text 130 .thumb_func 131 .align 4 132 .p2align 4,,15 133 .global memchr 134 .type memchr,%function 135 136memchr: 137 .cfi_sections .debug_frame 138 .cfi_startproc 139 /* Use a simple loop if there are less than 8 bytes to search. */ 140 cmp cntin, #7 141 bhi .Llargestr 142 and chrin, chrin, #0xff 143 144.Lsmallstr: 145 subs cntin, cntin, #1 146 blo .Lnotfound /* Return not found if reached end. */ 147 ldrb tmp, [srcin], #1 148 cmp tmp, chrin 149 bne .Lsmallstr /* Loop again if not found. 
*/ 150 /* Otherwise fixup address and return. */ 151 sub result, result, #1 152 bx lr 153 154 155.Llargestr: 156 vdup.8 vrepchr, chrin /* Duplicate char across all lanes. */ 157 /* 158 * Magic constant 0x8040201008040201 allows us to identify which lane 159 * matches the requested byte. 160 */ 161 movw tmp, #0x0201 162 movt tmp, #0x0804 163 lsl soff, tmp, #4 164 vmov vrepmask0, tmp, soff 165 vmov vrepmask1, tmp, soff 166 /* Work with aligned 32-byte chunks */ 167 bic src, srcin, #31 168 ands soff, srcin, #31 169 beq .Lloopintro /* Go straight to main loop if it's aligned. */ 170 171 /* 172 * Input string is not 32-byte aligned. We calculate the syndrome 173 * value for the aligned 32 bytes block containing the first bytes 174 * and mask the irrelevant part. 175 */ 176 vld1.8 {vdata0, vdata1}, [src:256]! 177 sub tmp, soff, #32 178 adds cntin, cntin, tmp 179 vceq.i8 vdata0, vdata0, vrepchr 180 vceq.i8 vdata1, vdata1, vrepchr 181 vand vdata0, vdata0, vrepmask 182 vand vdata1, vdata1, vrepmask 183 vpadd.i8 vdata0_0, vdata0_0, vdata0_1 184 vpadd.i8 vdata1_0, vdata1_0, vdata1_1 185 vpadd.i8 vdata0_0, vdata0_0, vdata1_0 186 vpadd.i8 vdata0_0, vdata0_0, vdata0_0 187 vmov synd, vdata0_0[0] 188 189 /* Clear the soff lower bits */ 190 lsr synd, synd, soff 191 lsl synd, synd, soff 192 /* The first block can also be the last */ 193 bls .Lmasklast 194 /* Have we found something already? */ 195 cbnz synd, .Ltail 196 197 198.Lloopintro: 199 vpush {vend} 200 /* 264/265 correspond to d8/d9 for q4 */ 201 .cfi_adjust_cfa_offset 16 202 .cfi_rel_offset 264, 0 203 .cfi_rel_offset 265, 8 204 .p2align 3,,7 205.Lloop: 206 vld1.8 {vdata0, vdata1}, [src:256]! 207 subs cntin, cntin, #32 208 vceq.i8 vdata0, vdata0, vrepchr 209 vceq.i8 vdata1, vdata1, vrepchr 210 /* If we're out of data we finish regardless of the result. */ 211 bls .Lend 212 /* Use a fast check for the termination condition. 
*/ 213 vorr vend, vdata0, vdata1 214 vorr vend0, vend0, vend1 215 vmov synd, tmp, vend0 216 orrs synd, synd, tmp 217 /* We're not out of data, loop if we haven't found the character. */ 218 beq .Lloop 219 220.Lend: 221 vpop {vend} 222 .cfi_adjust_cfa_offset -16 223 .cfi_restore 264 224 .cfi_restore 265 225 226 /* Termination condition found, let's calculate the syndrome value. */ 227 vand vdata0, vdata0, vrepmask 228 vand vdata1, vdata1, vrepmask 229 vpadd.i8 vdata0_0, vdata0_0, vdata0_1 230 vpadd.i8 vdata1_0, vdata1_0, vdata1_1 231 vpadd.i8 vdata0_0, vdata0_0, vdata1_0 232 vpadd.i8 vdata0_0, vdata0_0, vdata0_0 233 vmov synd, vdata0_0[0] 234 cbz synd, .Lnotfound 235 bhi .Ltail 236 237 238.Lmasklast: 239 /* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */ 240 neg cntin, cntin 241 lsl synd, synd, cntin 242 lsrs synd, synd, cntin 243 it eq 244 moveq src, #0 /* If no match, set src to 0 so the retval is 0. */ 245 246 247.Ltail: 248 /* Count the trailing zeros using bit reversing */ 249 rbit synd, synd 250 /* Compensate the last post-increment */ 251 sub src, src, #32 252 /* Count the leading zeros */ 253 clz synd, synd 254 /* Compute the potential result and return */ 255 add result, src, synd 256 bx lr 257 258 259.Lnotfound: 260 /* Set result to NULL if not found and return */ 261 mov result, #0 262 bx lr 263 264 .cfi_endproc 265 .size memchr, . - memchr 266 267#elif __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP) 268 269#if __ARM_ARCH_PROFILE == 'M' 270#if __ARM_ARCH >= 8 271 /* keep config inherited from -march=. 
*/ 272#else 273 .arch armv7e-m 274#endif /* __ARM_ARCH >= 8 */ 275#else 276 .arch armv6t2 277#endif /* __ARM_ARCH_PROFILE == 'M' */ 278 279@ this lets us check a flag in a 00/ff byte easily in either endianness 280#ifdef __ARMEB__ 281#define CHARTSTMASK(c) 1<<(31-(c*8)) 282#else 283#define CHARTSTMASK(c) 1<<(c*8) 284#endif 285 .text 286 .thumb 287 288@ --------------------------------------------------------------------------- 289 .thumb_func 290 .align 2 291 .p2align 4,,15 292 .global memchr 293 .type memchr,%function 294 .fnstart 295 .cfi_startproc 296memchr: 297 @ r0 = start of memory to scan 298 @ r1 = character to look for 299 @ r2 = length 300 @ returns r0 = pointer to character or NULL if not found 301 prologue 302 and r1,r1,#0xff @ Don't trust the caller to pass a char 303 304 cmp r2,#16 @ If short don't bother with anything clever 305 blt 20f 306 307 tst r0, #7 @ If it's already aligned skip the next bit 308 beq 10f 309 310 @ Work up to an aligned point 3115: 312 ldrb r3, [r0],#1 313 subs r2, r2, #1 314 cmp r3, r1 315 beq 50f @ If it matches exit found 316 tst r0, #7 317 cbz r2, 40f @ If we run off the end, exit not found 318 bne 5b @ If not aligned yet then do next byte 319 32010: 321 @ We are aligned, we know we have at least 8 bytes to work with 322 push {r4,r5,r6,r7} 323 .cfi_adjust_cfa_offset 16 324 .cfi_rel_offset 4, 0 325 .cfi_rel_offset 5, 4 326 .cfi_rel_offset 6, 8 327 .cfi_rel_offset 7, 12 328 orr r1, r1, r1, lsl #8 @ expand the match word across all bytes 329 orr r1, r1, r1, lsl #16 330 bic r4, r2, #7 @ Number of double words to work with * 8 331 mvns r7, #0 @ all F's 332 movs r3, #0 333 33415: 335 ldrd r5,r6,[r0],#8 336 subs r4, r4, #8 337 eor r5,r5, r1 @ r5,r6 have 00's where bytes match the target 338 eor r6,r6, r1 339 uadd8 r5, r5, r7 @ Par add 0xff - sets GE bits for bytes!=0 340 sel r5, r3, r7 @ bytes are 00 for none-00 bytes, 341 @ or ff for 00 bytes - NOTE INVERSION 342 uadd8 r6, r6, r7 @ Par add 0xff - sets GE bits for bytes!=0 343 sel 
r6, r5, r7 @ chained....bytes are 00 for none-00 bytes 344 @ or ff for 00 bytes - NOTE INVERSION 345 cbnz r6, 60f 346 bne 15b @ (Flags from the subs above) 347 348 pop {r4,r5,r6,r7} 349 .cfi_restore 7 350 .cfi_restore 6 351 .cfi_restore 5 352 .cfi_restore 4 353 .cfi_adjust_cfa_offset -16 354 and r1,r1,#0xff @ r1 back to a single character 355 and r2,r2,#7 @ Leave the count remaining as the number 356 @ after the double words have been done 357 35820: 359 cbz r2, 40f @ 0 length or hit the end already then not found 360 36121: @ Post aligned section, or just a short call 362 ldrb r3,[r0],#1 363 subs r2,r2,#1 364 eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub 365 cbz r3, 50f 366 bne 21b @ on r2 flags 367 36840: 369 .cfi_remember_state 370 movs r0,#0 @ not found 371 epilogue 372 37350: 374 .cfi_restore_state 375 .cfi_remember_state 376 subs r0,r0,#1 @ found 377 epilogue 378 37960: @ We're here because the fast path found a hit 380 @ now we have to track down exactly which word it was 381 @ r0 points to the start of the double word after the one tested 382 @ r5 has the 00/ff pattern for the first word, r6 has the chained value 383 @ This point is reached from cbnz midway through label 15 prior to 384 @ popping r4-r7 off the stack. .cfi_restore_state alone disregards 385 @ this, so we manually correct this. 
386 .cfi_restore_state @ Standard post-prologue state 387 .cfi_adjust_cfa_offset 16 388 .cfi_rel_offset 4, 0 389 .cfi_rel_offset 5, 4 390 .cfi_rel_offset 6, 8 391 .cfi_rel_offset 7, 12 392 cmp r5, #0 393 itte eq 394 moveq r5, r6 @ the end is in the 2nd word 395 subeq r0,r0,#3 @ Points to 2nd byte of 2nd word 396 subne r0,r0,#7 @ or 2nd byte of 1st word 397 398 @ r0 currently points to the 2nd byte of the word containing the hit 399 tst r5, # CHARTSTMASK(0) @ 1st character 400 bne 61f 401 adds r0,r0,#1 402 tst r5, # CHARTSTMASK(1) @ 2nd character 403 ittt eq 404 addeq r0,r0,#1 405 tsteq r5, # (3<<15) @ 2nd & 3rd character 406 @ If not the 3rd must be the last one 407 addeq r0,r0,#1 408 40961: 410 pop {r4,r5,r6,r7} 411 .cfi_restore 7 412 .cfi_restore 6 413 .cfi_restore 5 414 .cfi_restore 4 415 .cfi_adjust_cfa_offset -16 416 subs r0,r0,#1 417 epilogue 418 .cfi_endproc 419 .cantunwind 420 .fnend 421#else 422 /* Defined in memchr.c. */ 423#endif 424