/*
 * memchr - find a character in a memory zone
 *
 * Copyright (c) 2014, ARM Limited
 * All rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the company nor the names of its contributors
 *       may be used to endorse or promote products derived from this
 *       software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
/* See memchr-stub.c */
#else
/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

/* Arguments and results. */
#define srcin		x0
#define chrin		w1
#define cntin		x2

#define result		x0

#define src		x3
#define tmp		x4
#define wtmp2		w5
#define synd		x6
#define soff		x9
#define cntrem		x10

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_chr1	v3
#define vhas_chr2	v4
#define vrepmask	v5
#define vend		v6

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (faster than using a 32-bit
 * syndrome). Since the bits in the syndrome reflect exactly the order in
 * which things occur in the original string, counting trailing zeros
 * identifies exactly which byte has matched.
 */

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

def_fn memchr
	/* Do not dereference srcin if no bytes to compare. */
	cbz	cntin, .Lzero_length
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.
	 */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	ands	soff, srcin, #31
	and	cntrem, cntin, #31
	b.eq	.Lloop

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32-byte block containing the first bytes
	 * and mask the irrelevant part.
	 */
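	/*
	 * For example, with srcin 5 bytes into its 32-byte chunk (soff = 5),
	 * the 2 * 5 = 10 low syndrome bits describe bytes that precede the
	 * buffer; the lsr/lsl pair below shifts them out and back, clearing
	 * them so a spurious match there is never reported.
	 */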
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
	/* The first block can also be the last */
	b.ls	.Lmasklast
	/* Have we found something already? */
	cbnz	synd, .Ltail

.Lloop:
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	subs	cntin, cntin, #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result */
	b.ls	.Lend
	/* Use a fast check for the termination condition */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
	mov	synd, vend.d[0]
	/* We're not out of data, loop if we haven't found the character */
	cbz	synd, .Lloop

.Lend:
	/* Termination condition found, let's calculate the syndrome value */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Only do the clear for the last possible block */
	b.hi	.Ltail

.Lmasklast:
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp

.Ltail:
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate the last post-increment */
	sub	src, src, #32
	/* Check that we have found a character */
	cmp	synd, #0
	/* And count the leading zeros */
	clz	synd, synd
	/* Compute the potential result */
	add	result, src, synd, lsr #1
	/* Select result or NULL */
	csel	result, xzr, result, eq
	ret

.Lzero_length:
	mov	result, #0
	ret

	.size	memchr, . - memchr
#endif