/*
   strchrnul - find a character or nul in a string

   Copyright (c) 2014, ARM Limited
   All rights Reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the company nor the names of its contributors
         may be used to endorse or promote products derived from this
         software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined (__LP64__)
/* See strchrnul-stub.c  */
#else

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon available.
 */

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

/* Locals and temporaries.  */
#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask	v7
#define vend1		v16

/* Core algorithm:

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big-
   and little-endian systems).  For each tuple, bit 0 is set iff the
   relevant byte matched the requested character or nul.  Since the
   bits in the syndrome reflect exactly the order in which things occur
   in the original string, a count_trailing_zeros() operation will
   identify exactly which byte is causing the termination.  */
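
/* As an illustration, the syndrome construction and decode described
   above can be modelled in C roughly as follows.  This is a sketch
   only, not part of the build; hunk_syndrome is a hypothetical helper
   name, and __builtin_ctzll assumes a GCC/Clang-style compiler.

   #include <stdint.h>

   // Bit 2*i of the syndrome is set iff byte i of the 32-byte hunk
   // equals the target character or nul.
   static uint64_t
   hunk_syndrome (const unsigned char *hunk, unsigned char c)
   {
     uint64_t syndrome = 0;
     for (int i = 0; i < 32; i++)
       if (hunk[i] == c || hunk[i] == 0)
         syndrome |= 1ull << (2 * i);
     return syndrome;
   }

   // A non-zero syndrome terminates the scan.  The matching byte's
   // offset within the hunk is half the count of trailing zeros; the
   // assembly below gets the same result with rbit followed by clz:
   //   size_t offset = __builtin_ctzll (syndrome) >> 1;
*/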

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

def_fn strchrnul
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the termination condition.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask.4s, wtmp2
	ands	tmp1, srcin, #31
	b.eq	.Lloop

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	orr	vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
	orr	vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	lsl	tmp1, tmp1, #1
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	mov	tmp3, #~0
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
	lsr	tmp1, tmp3, tmp1

	mov	tmp3, vend1.d[0]
	bic	tmp1, tmp3, tmp1	// Mask padding bits.
	cbnz	tmp1, .Ltail

.Lloop:
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Use a fast check for the termination condition.  */
	orr	vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
	orr	vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
	orr	vend1.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend1.2d, vend1.2d, vend1.2d
	mov	tmp1, vend1.d[0]
	cbz	tmp1, .Lloop

	/* Termination condition found.  Now need to establish exactly why
	   we terminated.  */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64

	mov	tmp1, vend1.d[0]
.Ltail:
	/* Count the trailing zeros, by bit reversing...  */
	rbit	tmp1, tmp1
	/* Re-bias source.  */
	sub	src, src, #32
	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
	/* tmp1 is twice the offset into the fragment.  */
	add	result, src, tmp1, lsr #1
	ret

	.size	strchrnul, . - strchrnul
#endif
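
/* For reference, the routine above implements the usual strchrnul
   semantics: return a pointer to the first occurrence of the character,
   or to the terminating nul if the character is not found.  A minimal
   portable C model (illustrative only, not part of the build):

   char *
   strchrnul (const char *s, int c)
   {
     while (*s != '\0' && *s != (char) c)
       s++;
     return (char *) s;
   }
*/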