1/* Copyright (c) 2013-2015, Linaro Limited 2 All rights reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions are met: 6 * Redistributions of source code must retain the above copyright 7 notice, this list of conditions and the following disclaimer. 8 * Redistributions in binary form must reproduce the above copyright 9 notice, this list of conditions and the following disclaimer in the 10 documentation and/or other materials provided with the distribution. 11 * Neither the name of the Linaro nor the 12 names of its contributors may be used to endorse or promote products 13 derived from this software without specific prior written permission. 14 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 26#include <picolibc.h> 27 28#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__) || !defined(__ARM_NEON) 29/* See strlen-stub.c */ 30#else 31 32/* Assumptions: 33 * 34 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. 35 * Not MTE compatible. 
 */

#include "asmdefs.h"

/* Register aliases.  srcin/len share x0: the input pointer is consumed
   and the return value is built in its place.  */
#define srcin		x0
#define len		x0

#define src		x1
#define data1		x2
#define data2		x3
#define has_nul1	x4
#define has_nul2	x5
/* tmp1/tmp2 deliberately alias has_nul1/has_nul2 (x4/x5): the
   temporaries are dead by the time the has_nul values are produced.  */
#define tmp1		x4
#define tmp2		x5
#define tmp3		x6
#define tmp4		x7
#define zeroones	x8

/* SIMD registers used by the 32-bytes-per-iteration loop and the
   page-cross path.  maskv/maskd are the vector and scalar views of v0;
   dataq1/datav1 and dataq2/datav2 are the q- and v- views of v1/v2.  */
#define maskv		v0
#define maskd		d0
#define dataq1		q1
#define dataq2		q2
#define datav1		v1
#define datav2		v2
/* tmp/tmpw and synd/syndw alias data1/data2 (x2/x3): the integer data
   registers are no longer live once the SIMD paths run.  */
#define tmp		x2
#define tmpw		w2
#define synd		x3
#define syndw		w3
#define shift		x4

/* For the first 32 bytes, NUL detection works on the principle that
   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
   byte is zero, and can be done in parallel across the entire word.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* To test the page crossing code path more thoroughly, compile with
   -DTEST_PAGE_CROSS - this will force all calls through the slower
   entry path.  This option is not intended for production use.  */

#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif

/* Core algorithm:

   Since strings are short on average, we check the first 32 bytes of the
   string for a NUL character without aligning the string.  In order to use
   unaligned loads safely we must do a page cross check first.

   If there is a NUL byte we calculate the length from the 2 8-byte words
   using conditional select to reduce branch mispredictions (it is unlikely
   strlen will be repeatedly called on strings with the same length).

   If the string is longer than 32 bytes, align src so we don't need further
   page cross checks, and process 32 bytes per iteration using a fast SIMD
   loop.

   If the page cross check fails, we read 32 bytes from an aligned address,
   and ignore any characters before the string.  If it contains a NUL
   character, return the length, if not, continue in the main loop.
 */

/* size_t strlen (const char *s)

   In:       x0 = s (srcin)
   Out:      x0 = number of bytes before the terminating NUL (len)
   Clobbers: x1-x8, v0-v2, NZCV.  Leaf function; no stack use.

   Uses unaligned 16/32-byte loads guarded by a page-cross check, so
   every load stays within the page containing the bytes it needs.
   Not MTE compatible (see the assumptions above).  */
ENTRY (strlen)
	PTR_ARG (0)
	/* If srcin is within 32 bytes of the end of a MIN_PAGE_SIZE page,
	   an unaligned 32-byte probe could cross into an unmapped page;
	   take the aligned slow path instead.  */
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	cmp	tmp1, MIN_PAGE_SIZE - 32
	b.hi	L(page_cross)

	/* Look for a NUL byte in the first 16 bytes.  */
	ldp	data1, data2, [srcin]
	mov	zeroones, REP8_01

#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_null1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	/* (X - 1) & ~(X | 0x7f): each byte of has_nul1/2 is 0x80 where the
	   corresponding data byte was NUL, 0x00 otherwise.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	/* BICS cleared C; CCMP (has_nul2, 0) sets C when has_nul1 == 0,
	   and leaves Z set only when both words are NUL-free.  */
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(bytes16_31)

	/* Find the exact offset of the first NUL byte in the first 16 bytes
	   from the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc	/* word holding the NUL */
	mov	len, 8
	rev	has_nul1, has_nul1	/* first NUL marker -> most significant */
	csel	len, xzr, len, cc	/* base 0 or 8 for the chosen word */
	clz	tmp1, has_nul1
	add	len, len, tmp1, lsr 3	/* clz / 8 = byte index in word */
	ret

	/* Look for a NUL byte at offset 16..31 in the string.  */
L(bytes16_31):
	ldp	data1, data2, [srcin, 16]
#ifdef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	/* Same NUL-detection recurrence as for bytes 0..15.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(loop_entry)

	/* Find the exact offset of the first NUL byte at offset 16..31 from
	   the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 24
	rev	has_nul1, has_nul1
	mov	tmp3, 16
	clz	tmp1, has_nul1
	csel	len, tmp3, len, cc	/* base 16 or 24 for the chosen word */
	add	len, len, tmp1, lsr 3
	ret

	/* Padding so the loop below lands on its alignment boundary —
	   NOTE(review): placement appears tuned for fetch alignment.  */
	nop
L(loop_entry):
	bic	src, srcin, 31	/* align down; loop pre-increments by 32 */

	.p2align 5
L(loop):
	/* Scan 32 bytes per iteration.  umin of the two vectors is zero in
	   a lane iff either input lane is zero; two uminp steps reduce the
	   32 bytes to 8, and the cmeq/fmov yields a non-zero synd only if
	   some byte was NUL.  */
	ldp	dataq1, dataq2, [src, 32]!
	uminp	maskv.16b, datav1.16b, datav2.16b
	uminp	maskv.16b, maskv.16b, maskv.16b
	cmeq	maskv.8b, maskv.8b, 0
	fmov	synd, maskd
	cbz	synd, L(loop)

	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
	cmeq	maskv.16b, datav1.16b, 0
	sub	len, src, srcin
	cbnz	syndw, 1f
	cmeq	maskv.16b, datav2.16b, 0
	add	len, len, 16	/* NUL is in the second 16-byte vector */
1:
	/* Generate a bitmask and compute correct byte offset.  SHRN #4
	   narrows the 0xffff/0x0000 per-byte compare result to 4 bits per
	   input byte, so clz(synd) / 4 is the byte index.  */
	shrn	maskv.8b, maskv.8h, 4
	fmov	synd, maskd
#ifndef __AARCH64EB__
	rbit	synd, synd	/* little-endian: count from the LSB end */
#endif
	clz	tmp, synd
	add	len, len, tmp, lsr 2
	ret

L(page_cross):
	/* Near a page end: read an aligned 32-byte window covering srcin
	   and discard matches before the string start.  */
	bic	src, srcin, 31
	mov	tmpw, 0x0c03
	movk	tmpw, 0xc030, lsl 16
	ld1	{datav1.16b, datav2.16b}, [src]
	dup	maskv.4s, tmpw	/* per-byte 2-bit codes: 03 0c 30 c0 */
	cmeq	datav1.16b, datav1.16b, 0
	cmeq	datav2.16b, datav2.16b, 0
	and	datav1.16b, datav1.16b, maskv.16b
	and	datav2.16b, datav2.16b, maskv.16b
	/* Pairwise adds pack the 32 masked lanes into 64 bits of synd,
	   2 bits per input byte.  */
	addp	maskv.16b, datav1.16b, datav2.16b
	addp	maskv.16b, maskv.16b, maskv.16b
	fmov	synd, maskd
	/* Shift out the 2-bit fields of bytes before the string start
	   (lsr uses shift mod 64, i.e. 2 * (srcin & 31)).  */
	lsl	shift, srcin, 1
	lsr	synd, synd, shift
	/* No NUL in this window: fall into the aligned main loop, which
	   pre-increments src past the window just examined.  */
	cbz	synd, L(loop)

	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 1	/* 2 bits per byte -> byte offset = length */
	ret

END (strlen)
#endif