1/* Copyright (c) 2013-2015, Linaro Limited 2 All rights reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions are met: 6 * Redistributions of source code must retain the above copyright 7 notice, this list of conditions and the following disclaimer. 8 * Redistributions in binary form must reproduce the above copyright 9 notice, this list of conditions and the following disclaimer in the 10 documentation and/or other materials provided with the distribution. 11 * Neither the name of the Linaro nor the 12 names of its contributors may be used to endorse or promote products 13 derived from this software without specific prior written permission. 14 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 26 27#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__) 28/* See strlen-stub.c */ 29#else 30 31/* Assumptions: 32 * 33 * ARMv8-a, AArch64, unaligned accesses, min page size 4k. 34 */ 35 36/* To test the page crossing code path more thoroughly, compile with 37 -DTEST_PAGE_CROSS - this will force all calls through the slower 38 entry path. This option is not intended for production use. */ 39 40/* Arguments and results. */ 41#define srcin x0 42#define len x0 43 44/* Locals and temporaries. */ 45#define src x1 46#define data1 x2 47#define data2 x3 48#define has_nul1 x4 49#define has_nul2 x5 50#define tmp1 x4 51#define tmp2 x5 52#define tmp3 x6 53#define tmp4 x7 54#define zeroones x8 55 56#define L(l) .L ## l 57 58 .macro def_fn f p2align=0 59 .text 60 .p2align \p2align 61 .global \f 62 .type \f, %function 63\f: 64 .endm 65 66 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 67 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 68 can be done in parallel across the entire word. A faster check 69 (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives 70 false hits for characters 129..255. */ 71 72#define REP8_01 0x0101010101010101 73#define REP8_7f 0x7f7f7f7f7f7f7f7f 74#define REP8_80 0x8080808080808080 75 76#ifdef TEST_PAGE_CROSS 77# define MIN_PAGE_SIZE 15 78#else 79# define MIN_PAGE_SIZE 4096 80#endif 81 82 /* Since strings are short on average, we check the first 16 bytes 83 of the string for a NUL character. In order to do an unaligned ldp 84 safely we have to do a page cross check first. If there is a NUL 85 byte we calculate the length from the 2 8-byte words using 86 conditional select to reduce branch mispredictions (it is unlikely 87 strlen will be repeatedly called on strings with the same length). 88 89 If the string is longer than 16 bytes, we align src so don't need 90 further page cross checks, and process 32 bytes per iteration 91 using the fast NUL check. If we encounter non-ASCII characters, 92 fallback to a second loop using the full NUL check. 93 94 If the page cross check fails, we read 16 bytes from an aligned 95 address, remove any characters before the string, and continue 96 in the main loop using aligned loads. Since strings crossing a 97 page in the first 16 bytes are rare (probability of 98 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. 99 100 AArch64 systems have a minimum page size of 4k. We don't bother 101 checking for larger page sizes - the cost of setting up the correct 102 page size is just not worth the extra gain from a small reduction in 103 the cases taking the slow path. Note that we only care about 104 whether the first fetch, which may be misaligned, crosses a page 105 boundary. */ 106 107def_fn strlen p2align=6 108 and tmp1, srcin, MIN_PAGE_SIZE - 1 109 mov zeroones, REP8_01 110 cmp tmp1, MIN_PAGE_SIZE - 16 111 b.gt L(page_cross) 112 ldp data1, data2, [srcin] 113#ifdef __AARCH64EB__ 114 /* For big-endian, carry propagation (if the final byte in the 115 string is 0x01) means we cannot use has_nul1/2 directly. 116 Since we expect strings to be small and early-exit, 117 byte-swap the data now so has_null1/2 will be correct. */ 118 rev data1, data1 119 rev data2, data2 120#endif 121 sub tmp1, data1, zeroones 122 orr tmp2, data1, REP8_7f 123 sub tmp3, data2, zeroones 124 orr tmp4, data2, REP8_7f 125 bics has_nul1, tmp1, tmp2 126 bic has_nul2, tmp3, tmp4 127 ccmp has_nul2, 0, 0, eq 128 beq L(main_loop_entry) 129 130 /* Enter with C = has_nul1 == 0. */ 131 csel has_nul1, has_nul1, has_nul2, cc 132 mov len, 8 133 rev has_nul1, has_nul1 134 clz tmp1, has_nul1 135 csel len, xzr, len, cc 136 add len, len, tmp1, lsr 3 137 ret 138 139 /* The inner loop processes 32 bytes per iteration and uses the fast 140 NUL check. If we encounter non-ASCII characters, use a second 141 loop with the accurate NUL check. */ 142 .p2align 4 143L(main_loop_entry): 144 bic src, srcin, 15 145 sub src, src, 16 146L(main_loop): 147 ldp data1, data2, [src, 32]! 148.Lpage_cross_entry: 149 sub tmp1, data1, zeroones 150 sub tmp3, data2, zeroones 151 orr tmp2, tmp1, tmp3 152 tst tmp2, zeroones, lsl 7 153 bne 1f 154 ldp data1, data2, [src, 16] 155 sub tmp1, data1, zeroones 156 sub tmp3, data2, zeroones 157 orr tmp2, tmp1, tmp3 158 tst tmp2, zeroones, lsl 7 159 beq L(main_loop) 160 add src, src, 16 1611: 162 /* The fast check failed, so do the slower, accurate NUL check. */ 163 orr tmp2, data1, REP8_7f 164 orr tmp4, data2, REP8_7f 165 bics has_nul1, tmp1, tmp2 166 bic has_nul2, tmp3, tmp4 167 ccmp has_nul2, 0, 0, eq 168 beq L(nonascii_loop) 169 170 /* Enter with C = has_nul1 == 0. */ 171L(tail): 172#ifdef __AARCH64EB__ 173 /* For big-endian, carry propagation (if the final byte in the 174 string is 0x01) means we cannot use has_nul1/2 directly. The 175 easiest way to get the correct byte is to byte-swap the data 176 and calculate the syndrome a second time. */ 177 csel data1, data1, data2, cc 178 rev data1, data1 179 sub tmp1, data1, zeroones 180 orr tmp2, data1, REP8_7f 181 bic has_nul1, tmp1, tmp2 182#else 183 csel has_nul1, has_nul1, has_nul2, cc 184#endif 185 sub len, src, srcin 186 rev has_nul1, has_nul1 187 add tmp2, len, 8 188 clz tmp1, has_nul1 189 csel len, len, tmp2, cc 190 add len, len, tmp1, lsr 3 191 ret 192 193L(nonascii_loop): 194 ldp data1, data2, [src, 16]! 195 sub tmp1, data1, zeroones 196 orr tmp2, data1, REP8_7f 197 sub tmp3, data2, zeroones 198 orr tmp4, data2, REP8_7f 199 bics has_nul1, tmp1, tmp2 200 bic has_nul2, tmp3, tmp4 201 ccmp has_nul2, 0, 0, eq 202 bne L(tail) 203 ldp data1, data2, [src, 16]! 204 sub tmp1, data1, zeroones 205 orr tmp2, data1, REP8_7f 206 sub tmp3, data2, zeroones 207 orr tmp4, data2, REP8_7f 208 bics has_nul1, tmp1, tmp2 209 bic has_nul2, tmp3, tmp4 210 ccmp has_nul2, 0, 0, eq 211 beq L(nonascii_loop) 212 b L(tail) 213 214 /* Load 16 bytes from [srcin & ~15] and force the bytes that precede 215 srcin to 0x7f, so we ignore any NUL bytes before the string. 216 Then continue in the aligned loop. */ 217L(page_cross): 218 bic src, srcin, 15 219 ldp data1, data2, [src] 220 lsl tmp1, srcin, 3 221 mov tmp4, -1 222#ifdef __AARCH64EB__ 223 /* Big-endian. Early bytes are at MSB. */ 224 lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ 225#else 226 /* Little-endian. Early bytes are at LSB. */ 227 lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ 228#endif 229 orr tmp1, tmp1, REP8_80 230 orn data1, data1, tmp1 231 orn tmp2, data2, tmp1 232 tst srcin, 8 233 csel data1, data1, tmp4, eq 234 csel data2, data2, tmp2, eq 235 b L(page_cross_entry) 236 237 .size strlen, . - strlen 238#endif 239