/* Copyright (c) 2013-2015, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
/* See strlen-stub.c  */
#else

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 * Not MTE compatible.
*/

#include "asmdefs.h"

/* Register roles.  Note the deliberate aliasing: srcin/len share x0
   (the input pointer is consumed before the length is produced), and
   has_nul1/2 share x4/x5 with tmp1/2 (live ranges do not overlap).  */
#define srcin		x0
#define len		x0

#define src		x1
#define	data1		x2
#define	data2		x3
#define	has_nul1	x4
#define	has_nul2	x5
#define	tmp1		x4
#define	tmp2		x5
#define	tmp3		x6
#define	tmp4		x7
#define	zeroones	x8

/* SIMD registers used by the main loop and the page-cross path.  */
#define maskv		v0
#define maskd		d0
#define dataq1		q1
#define dataq2		q2
#define datav1		v1
#define datav2		v2
#define tmp		x2
#define tmpw		w2
#define synd		x3
#define syndw		w3
#define shift		x4

/* For the first 32 bytes, NUL detection works on the principle that
   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
   byte is zero, and can be done in parallel across the entire word.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* To test the page crossing code path more thoroughly, compile with
   -DTEST_PAGE_CROSS - this will force all calls through the slower
   entry path.  This option is not intended for production use.  */

#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif

/* Core algorithm:

   Since strings are short on average, we check the first 32 bytes of the
   string for a NUL character without aligning the string.  In order to use
   unaligned loads safely we must do a page cross check first.

   If there is a NUL byte we calculate the length from the 2 8-byte words
   using conditional select to reduce branch mispredictions (it is unlikely
   strlen will be repeatedly called on strings with the same length).

   If the string is longer than 32 bytes, align src so we don't need further
   page cross checks, and process 32 bytes per iteration using a fast SIMD
   loop.

   If the page cross check fails, we read 32 bytes from an aligned address,
   and ignore any characters before the string.  If it contains a NUL
   character, return the length, if not, continue in the main loop.  */

/* size_t strlen (const char *s)
   In:       x0 = s (srcin)
   Out:      x0 = length in bytes, excluding the terminating NUL (len)
   Clobbers: x1-x8, v0-v2, NZCV.  */
ENTRY (strlen)
	PTR_ARG (0)
	/* If srcin is within 32 bytes of the end of a (minimum-sized) page,
	   an unaligned 32-byte read could fault - take the slow path.  */
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	cmp	tmp1, MIN_PAGE_SIZE - 32
	b.hi	L(page_cross)

	/* Look for a NUL byte in the first 16 bytes.  */
	ldp	data1, data2, [srcin]
	mov	zeroones, REP8_01

#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	/* Per-word NUL detection: (X - 0x01..01) & ~(X | 0x7f..7f) has the
	   top bit set in each byte lane that was zero (see comment above).  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	/* bics also sets Z from has_nul1; ccmp then tests has_nul2 only when
	   has_nul1 was zero, else forces NZCV = 0.  Net effect: fall through
	   iff a NUL was found, with C = (has_nul1 == 0) for the csel below.  */
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(bytes16_31)

	/* Find the exact offset of the first NUL byte in the first 16 bytes
	   from the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc	/* Pick the word with the NUL.  */
	mov	len, 8
	rev	has_nul1, has_nul1			/* clz then counts low-order bytes.  */
	csel	len, xzr, len, cc			/* Base offset 0 or 8.  */
	clz	tmp1, has_nul1
	add	len, len, tmp1, lsr 3			/* clz/8 = byte index within word.  */
	ret

	/* Look for a NUL byte at offset 16..31 in the string.  */
L(bytes16_31):
	ldp	data1, data2, [srcin, 16]
#ifdef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	/* Same NUL-detection and flag setup as for bytes 0..15.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(loop_entry)

	/* Find the exact offset of the first NUL byte at offset 16..31 from
	   the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 24
	rev	has_nul1, has_nul1
	mov	tmp3, 16
	clz	tmp1, has_nul1
	csel	len, tmp3, len, cc			/* Base offset 16 or 24.  */
	add	len, len, tmp1, lsr 3
	ret

	/* Padding before the loop entry (presumably for code alignment
	   ahead of the .p2align'd loop - NOTE(review): confirm intent).  */
	nop
L(loop_entry):
	/* Bytes 0..31 are NUL-free; round src down to a 32-byte boundary so
	   the loop's pre-indexed load starts at the next aligned 32 bytes.  */
	bic	src, srcin, 31

	.p2align 5
L(loop):
	ldp	dataq1, dataq2, [src, 32]!
	/* Reduce 32 bytes to an 8-byte syndrome: pairwise unsigned min twice,
	   so each syndrome byte is the min of 4 source bytes - it is zero
	   iff one of those 4 bytes was NUL.  */
	uminp	maskv.16b, datav1.16b, datav2.16b
	uminp	maskv.16b, maskv.16b, maskv.16b
	cmeq	maskv.8b, maskv.8b, 0
	fmov	synd, maskd
	cbz	synd, L(loop)

	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
	cmeq	maskv.16b, datav1.16b, 0
	sub	len, src, srcin
	cbnz	syndw, 1f
	/* NUL is in the second 16 bytes: rebuild the mask and add 16.  */
	cmeq	maskv.16b, datav2.16b, 0
	add	len, len, 16
1:
	/* Generate a bitmask and compute correct byte offset: shrn packs the
	   16-byte compare result into a 64-bit syndrome with 4 bits per input
	   byte, so clz (after rbit on little-endian) yields 4 * byte index.  */
	shrn	maskv.8b, maskv.8h, 4
	fmov	synd, maskd
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	tmp, synd
	add	len, len, tmp, lsr 2
	ret

L(page_cross):
	/* Near a page end: read the surrounding aligned 32 bytes instead
	   (cannot fault) and mask off bytes before the string start.  */
	bic	src, srcin, 31
	mov	tmpw, 0x0c03
	movk	tmpw, 0xc030, lsl 16		/* tmpw = 0xc0300c03: 2 mask bits per byte.  */
	ld1	{datav1.16b, datav2.16b}, [src]
	dup	maskv.4s, tmpw
	cmeq	datav1.16b, datav1.16b, 0
	cmeq	datav2.16b, datav2.16b, 0
	/* AND with the repeating mask then addp twice: compresses the 32
	   compare bytes into a 64-bit syndrome with 2 bits per input byte.  */
	and	datav1.16b, datav1.16b, maskv.16b
	and	datav2.16b, datav2.16b, maskv.16b
	addp	maskv.16b, datav1.16b, datav2.16b
	addp	maskv.16b, maskv.16b, maskv.16b
	fmov	synd, maskd
	/* Discard the bits for bytes before srcin: register lsr uses the
	   shift amount mod 64, so shifting by 2 * srcin == 2 * (srcin % 32).  */
	lsl	shift, srcin, 1
	lsr	synd, synd, shift
	/* No NUL in the aligned 32 bytes containing the string start -
	   src is already 32-byte aligned, continue in the main loop.  */
	cbz	synd, L(loop)

	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 1			/* 2 bits per byte -> byte index.  */
	ret

END (strlen)
#endif