/* Copyright (c) 2010-2011, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
        notice, this list of conditions and the following disclaimer in the
        documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
        contributors may be used to endorse or promote products derived
        from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   Written by Dave Gilbert <david.gilbert@linaro.org>

   This memchr routine is optimised on a Cortex-A9 and should work on
   all ARMv7 processors.  It has a fast path for short sizes, and has
   an optimised path for large data sets; the worst case is finding the
   match early in a large data set.  */

/* Copyright (c) 2015 ARM Ltd.
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */

// 2011-02-07 david.gilbert@linaro.org
//  Extracted from local git a5b438d861
// 2011-07-14 david.gilbert@linaro.org
//  Import endianness fix from local git ea786f1b
// 2011-10-11 david.gilbert@linaro.org
//  Import from cortex-strings bzr rev 63
//  Flip to ldrd (as suggested by Greta Yorsh)
//  Make conditional on CPU type
//  Tidy

// This code requires armv6t2 or later.  Uses Thumb2.

#include <picolibc.h>

	.syntax unified

#include "arm_asm.h"

// NOTE: This ifdef MUST match the one in memchr-stub.c
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
#if __ARM_ARCH >= 8 && __ARM_ARCH_PROFILE == 'R'
	.arch	armv8-r
#else
	.arch	armv7-a
#endif
	.fpu	neon


/* Arguments */
#define srcin		r0
#define chrin		r1
#define cntin		r2

/* Retval */
#define result		r0	/* Live range does not overlap with srcin */

/* Working registers */
#define src		r1	/* Live range does not overlap with chrin */
#define tmp		r3
#define synd		r0	/* No overlap with srcin or result */
#define soff		r12

/* Working NEON registers */
#define vrepchr		q0
#define vdata0		q1
#define vdata0_0	d2	/* Lower half of vdata0 */
#define vdata0_1	d3	/* Upper half of vdata0 */
#define vdata1		q2
#define vdata1_0	d4	/* Lower half of vdata1 */
#define vdata1_1	d5	/* Upper half of vdata1 */
#define vrepmask	q3
#define vrepmask0	d6
#define vrepmask1	d7
#define vend		q4
#define vend0		d8
#define vend1		d9

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit
 * per byte. Each bit is set if the relevant byte matched the requested
 * character and cleared otherwise. Since the bits in the syndrome reflect
 * exactly the order in which things occur in the original string, counting
 * trailing zeros identifies exactly which byte has matched.
 */
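
/*
 * An illustrative C model of that computation (kept inside this comment and
 * not assembled; the helper name syndrome32 is made up for the sketch):
 *
 *	#include <stdint.h>
 *
 *	static uint32_t syndrome32(const uint8_t *chunk, uint8_t c)
 *	{
 *		uint32_t synd = 0;
 *		for (int i = 0; i < 32; i++)
 *			if (chunk[i] == c)
 *				synd |= (uint32_t)1 << i; // bit i <=> byte i matched
 *		return synd;
 *	}
 *
 *	// If synd != 0, the index of the first match within the chunk is
 *	// __builtin_ctz(synd); the code below gets the same count with
 *	// rbit + clz, since there is no ctz instruction.
 *
 * The NEON code builds the same value without a per-byte loop: vceq.i8
 * turns every matching byte into 0xff, ANDing with the repeating
 * 0x8040201008040201 mask (vrepmask) leaves one distinct bit per byte, and
 * the chain of vpadd.i8 instructions folds the 32 bytes into a single
 * 32-bit word (pairwise addition acts as OR because the surviving bits
 * are disjoint).
 */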

	.text
	.thumb_func
	.align 4
	.p2align 4,,15
	.global memchr
	.type memchr,%function

memchr:
	.cfi_sections .debug_frame
	.cfi_startproc
	/* Use a simple loop if there are fewer than 8 bytes to search. */
	cmp	cntin, #7
	bhi	.Llargestr
	and	chrin, chrin, #0xff

.Lsmallstr:
	subs	cntin, cntin, #1
	blo	.Lnotfound	/* Return not found if reached end. */
	ldrb	tmp, [srcin], #1
	cmp	tmp, chrin
	bne	.Lsmallstr	/* Loop again if not found. */
	/* Otherwise fixup address and return. */
	sub	result, result, #1
	bx	lr


.Llargestr:
	vdup.8	vrepchr, chrin	/* Duplicate char across all lanes. */
	/*
	 * Magic constant 0x8040201008040201 allows us to identify which lane
	 * matches the requested byte.
	 */
	movw	tmp, #0x0201
	movt	tmp, #0x0804
	lsl	soff, tmp, #4
	vmov	vrepmask0, tmp, soff
	vmov	vrepmask1, tmp, soff
	/* Work with aligned 32-byte chunks. */
	bic	src, srcin, #31
	ands	soff, srcin, #31
	beq	.Lloopintro	/* Go straight to main loop if it's aligned. */

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32-byte block containing the first bytes
	 * and mask the irrelevant part.
	 */
	vld1.8	{vdata0, vdata1}, [src:256]!
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp
	vceq.i8	vdata0, vdata0, vrepchr
	vceq.i8	vdata1, vdata1, vrepchr
	vand	vdata0, vdata0, vrepmask
	vand	vdata1, vdata1, vrepmask
	vpadd.i8 vdata0_0, vdata0_0, vdata0_1
	vpadd.i8 vdata1_0, vdata1_0, vdata1_1
	vpadd.i8 vdata0_0, vdata0_0, vdata1_0
	vpadd.i8 vdata0_0, vdata0_0, vdata0_0
	vmov	synd, vdata0_0[0]

	/* Clear the soff lower bits. */
	lsr	synd, synd, soff
	lsl	synd, synd, soff
	/* The first block can also be the last. */
	bls	.Lmasklast
	/* Have we found something already? */
	cbnz	synd, .Ltail


.Lloopintro:
	vpush	{vend}
	/* 264/265 correspond to d8/d9 for q4 */
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 264, 0
	.cfi_rel_offset 265, 8
	.p2align 3,,7
.Lloop:
	vld1.8	{vdata0, vdata1}, [src:256]!
	subs	cntin, cntin, #32
	vceq.i8	vdata0, vdata0, vrepchr
	vceq.i8	vdata1, vdata1, vrepchr
	/* If we're out of data we finish regardless of the result. */
	bls	.Lend
	/* Use a fast check for the termination condition. */
	vorr	vend, vdata0, vdata1
	vorr	vend0, vend0, vend1
	vmov	synd, tmp, vend0
	orrs	synd, synd, tmp
	/* We're not out of data, loop if we haven't found the character. */
	beq	.Lloop

.Lend:
	vpop	{vend}
	.cfi_adjust_cfa_offset -16
	.cfi_restore 264
	.cfi_restore 265

	/* Termination condition found, let's calculate the syndrome value. */
	vand	vdata0, vdata0, vrepmask
	vand	vdata1, vdata1, vrepmask
	vpadd.i8 vdata0_0, vdata0_0, vdata0_1
	vpadd.i8 vdata1_0, vdata1_0, vdata1_1
	vpadd.i8 vdata0_0, vdata0_0, vdata1_0
	vpadd.i8 vdata0_0, vdata0_0, vdata0_0
	vmov	synd, vdata0_0[0]
	cbz	synd, .Lnotfound
	bhi	.Ltail


.Lmasklast:
	/* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */
	neg	cntin, cntin
	lsl	synd, synd, cntin
	lsrs	synd, synd, cntin
	it	eq
	moveq	src, #0	/* If no match, set src to 0 so the retval is 0. */


.Ltail:
	/* Count the trailing zeros using bit reversing. */
	rbit	synd, synd
	/* Compensate for the last post-increment. */
	sub	src, src, #32
	/* Count the leading zeros. */
	clz	synd, synd
	/* Compute the potential result and return. */
	add	result, src, synd
	bx	lr


.Lnotfound:
	/* Set result to NULL if not found and return. */
	mov	result, #0
	bx	lr

	.cfi_endproc
	.size	memchr, . - memchr

#elif __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)

#if __ARM_ARCH_PROFILE == 'M'
#if __ARM_ARCH >= 8
	/* Keep the configuration inherited from -march=.  */
#else
	.arch	armv7e-m
#endif /* __ARM_ARCH >= 8 */
#else
	.arch	armv6t2
#endif /* __ARM_ARCH_PROFILE == 'M' */

// This lets us check a flag in a 00/ff byte easily in either endianness.
#ifdef __ARMEB__
#define CHARTSTMASK(c) 1<<(31-(c*8))
#else
#define CHARTSTMASK(c) 1<<(c*8)
#endif
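
/*
 * Overview of the path below (the C fragment in this comment is an
 * illustrative sketch only, assuming a little-endian word; it is not
 * assembled).  The main loop XORs each 32-bit word with the match
 * character replicated into all four bytes, so matching bytes become
 * 0x00.  UADD8 then adds 0xff to every byte, setting the per-byte GE
 * flags exactly for the bytes that were non-zero, and SEL uses those
 * flags to build a word holding 0x00 for non-matching bytes and 0xff for
 * matching ones.  Roughly, per word:
 *
 *	uint32_t w = word ^ (c * 0x01010101u); // 00 bytes where c matched
 *	uint32_t flags = 0;
 *	for (int i = 0; i < 4; i++)            // what UADD8 + SEL compute
 *		if (((w >> (8 * i)) & 0xffu) == 0)
 *			flags |= 0xffu << (8 * i);
 *	// flags != 0  <=>  this word contains the character
 *
 * The second SEL below chains the two words of each doubleword (feeding
 * the first word's flags in as the "GE set" operand), so a single CBNZ
 * detects a hit anywhere in the 8 bytes just loaded.
 */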
	.text
	.thumb

// ---------------------------------------------------------------------------
	.thumb_func
	.align 2
	.p2align 4,,15
	.global memchr
	.type memchr,%function
	.fnstart
	.cfi_sections .debug_frame
	.cfi_startproc
memchr:
	// r0 = start of memory to scan
	// r1 = character to look for
	// r2 = length
	// returns r0 = pointer to character or NULL if not found
	prologue
	and	r1,r1,#0xff	// Don't trust the caller to pass a char

	cmp	r2,#16		// If short don't bother with anything clever
	blt	20f

	tst	r0, #7		// If it's already aligned skip the next bit
	beq	10f

	// Work up to an aligned point
5:
	ldrb	r3, [r0],#1
	subs	r2, r2, #1
	cmp	r3, r1
	beq	50f		// If it matches exit found
	tst	r0, #7
	cbz	r2, 40f		// If we run off the end, exit not found
	bne	5b		// If not aligned yet then do next byte

10:
	// We are aligned: we know we have at least 8 bytes to work with
	push	{r4,r5,r6,r7}
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	.cfi_rel_offset 6, 8
	.cfi_rel_offset 7, 12
	orr	r1, r1, r1, lsl #8	// expand the match byte across the word
	orr	r1, r1, r1, lsl #16
	bic	r4, r2, #7	// Number of double words to work with, times 8
	mvns	r7, #0		// all F's
	movs	r3, #0

15:
	ldrd	r5,r6,[r0],#8
	subs	r4, r4, #8
	eor	r5,r5, r1	// r5,r6 have 00's where bytes match the target
	eor	r6,r6, r1
	uadd8	r5, r5, r7	// Parallel add 0xff - sets GE bits for bytes != 0
	sel	r5, r3, r7	// bytes are 00 for non-00 bytes,
				// or ff for 00 bytes - NOTE INVERSION
	uadd8	r6, r6, r7	// Parallel add 0xff - sets GE bits for bytes != 0
	sel	r6, r5, r7	// chained... bytes are 00 for non-00 bytes,
				// or ff for 00 bytes - NOTE INVERSION
	cbnz	r6, 60f
	bne	15b		// (Flags from the subs above)

	pop	{r4,r5,r6,r7}
	.cfi_restore 7
	.cfi_restore 6
	.cfi_restore 5
	.cfi_restore 4
	.cfi_adjust_cfa_offset -16
	and	r1,r1,#0xff	// r1 back to a single character
	and	r2,r2,#7	// Leave the count remaining as the number
				// after the double words have been done

20:
	cbz	r2, 40f		// Zero length, or we already hit the end: not found

21:	// Post-aligned section, or just a short call
	ldrb	r3,[r0],#1
	subs	r2,r2,#1
	eor	r3,r3,r1	// r3 = 0 if match - doesn't break flags from sub
	cbz	r3, 50f
	bne	21b		// on r2 flags

40:
	.cfi_remember_state
	movs	r0,#0		// not found
	epilogue

50:
	.cfi_restore_state
	.cfi_remember_state
	subs	r0,r0,#1	// found
	epilogue

60:	// We're here because the fast path found a hit;
	// now we have to track down exactly which word it was in.
	// r0 points to the start of the double word after the one tested.
	// r5 has the 00/ff pattern for the first word, r6 has the chained value.
	// This point is reached from the cbnz midway through label 15, prior to
	// popping r4-r7 off the stack. .cfi_restore_state alone disregards
	// this, so we correct it manually.
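	// The location step below, as a C-like sketch (illustrative only, not
	// assembled; "flags" stands for the 00/ff pattern left in r5):
	//
	//	if (flags & CHARTSTMASK(0))      byte = 0;
	//	else if (flags & CHARTSTMASK(1)) byte = 1;
	//	else if (flags & (3 << 15))      byte = 2; // byte 1 is known clear
	//	                                           // here, so this isolates
	//	                                           // byte 2
	//	else                             byte = 3; // only candidate left
	//
	// The adds/addeq instructions advance r0 one byte per failed test
	// instead of materialising "byte".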
	.cfi_restore_state	// Standard post-prologue state
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	.cfi_rel_offset 6, 8
	.cfi_rel_offset 7, 12
	cmp	r5, #0
	itte	eq
	moveq	r5, r6		// the hit is in the 2nd word
	subeq	r0,r0,#3	// Points to 2nd byte of 2nd word
	subne	r0,r0,#7	// or 2nd byte of 1st word

	// r0 currently points to the 2nd byte of the word containing the hit
	tst	r5, # CHARTSTMASK(0)	// 1st character
	bne	61f
	adds	r0,r0,#1
	tst	r5, # CHARTSTMASK(1)	// 2nd character
	ittt	eq
	addeq	r0,r0,#1
	tsteq	r5, # (3<<15)		// 2nd & 3rd character
	// If not the 3rd, it must be the last one
	addeq	r0,r0,#1

61:
	pop	{r4,r5,r6,r7}
	.cfi_restore 7
	.cfi_restore 6
	.cfi_restore 5
	.cfi_restore 4
	.cfi_adjust_cfa_offset -16
	subs	r0,r0,#1
	epilogue
	.cfi_endproc
	.cantunwind
	.fnend
#else
	/* Defined in memchr.c.  */
#endif