/* strnlen - calculate the length of a string with limit.

   Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
/* See strlen-stub.c  */
#else

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

/* Register usage summary:
     In:       x0 = address of the string, x1 = maximum byte count.
     Out:      x0 = MIN (offset of the first NUL byte, x1).
     Scratch:  x2-x14 and the NZCV flags are overwritten.
   Memory is only ever read as 16-byte blocks from 16-byte aligned
   addresses, so bytes that lie before the string start or after its
   end but inside the same aligned block may be loaded.  */

/* Interface registers.  The result reuses the incoming pointer register.  */
#define s_in		x0
#define result		x0
#define maxlen		x1

/* Working registers.  */
#define cursor		x2	/* Aligned read pointer, post-incremented.  */
#define chunk_lo	x3	/* First Dword of the current pair.  */
#define chunk_hi	x4	/* Second Dword of the current pair.  */
#define chunk_hi_m	x5	/* Second Dword with leading bytes masked.  */
#define syn_lo		x6	/* NUL syndrome for chunk_lo.  */
#define syn_hi		x7	/* NUL syndrome for chunk_hi.  */
#define tmp_a		x8
#define tmp_b		x9
#define tmp_c		x10
#define tmp_d		x11
#define ones		x12	/* Holds BYTES_01 for the whole routine.  */
#define bitpos		x13
#define blocks		x14	/* 16-byte blocks still allowed, minus one.  */

/* One byte value replicated across all eight bytes of a Dword.  */
#define BYTES_01 0x0101010101010101
#define BYTES_7F 0x7f7f7f7f7f7f7f7f
#define BYTES_80 0x8080808080808080

	.text
	.p2align 6
.Lpad_start:
	/* Seven NOPs, the two-instruction tail below and the seven
	   instructions of the aligned entry sequence add up to 64 bytes,
	   which places the scanning loop at the start of an icache line.  */
	.rep 7
	nop
	.endr
	/* The "limit reached" exit lives inside the padding area so that
	   the space is not spent purely on NOPs.  */
.Lreturn_limit:
	mov	result, maxlen
	ret

	/* Function entry (the former single-use def_fn macro, expanded).  */
	.text
	.p2align 0
	.global strnlen
	.type strnlen, %function
strnlen:
	cbz	maxlen, .Lreturn_limit		/* A zero limit yields zero.  */
	mov	ones, #BYTES_01
	bic	cursor, s_in, #15		/* Round down to 16 bytes.  */
	ands	tmp_a, s_in, #15		/* Offset within the block.  */
	b.ne	.Lunaligned_start
	/* Aligned start: count = (limit - 1) / 16, i.e. the number of
	   16-byte blocks (full or partial) minus one.  */
	sub	blocks, maxlen, #1		/* maxlen != 0: cannot wrap.  */
	lsr	blocks, blocks, #4		/* Bytes -> Qwords.  */

	/* A zero byte is found with (X - 1) & ~(X | 0x7f), evaluated on
	   all eight bytes of a Dword at once: the result is non-zero iff
	   at least one byte of X is zero.  */
	/* Two Dwords are examined per iteration.  Start-up is slightly
	   more expensive, but the two independent dependency chains give
	   wide-issue cores more work to overlap.  */

	/* Start of critical section -- keep to one 64Byte cache line.  */
.Lscan:
	ldp	chunk_lo, chunk_hi, [cursor], #16
.Lcheck_pair:
	sub	tmp_a, chunk_lo, ones
	orr	tmp_b, chunk_lo, #BYTES_7F
	sub	tmp_c, chunk_hi, ones
	orr	tmp_d, chunk_hi, #BYTES_7F
	bic	syn_lo, tmp_a, tmp_b
	bic	syn_hi, tmp_c, tmp_d
	subs	blocks, blocks, #1
	orr	tmp_a, syn_lo, syn_hi
	/* While the counter is still non-negative, compare the combined
	   syndrome with zero; once it has gone negative force NZCV = 0000
	   so that the branch below falls through.  */
	ccmp	tmp_a, #0, #0, pl
	b.eq	.Lscan
	/* End of critical section -- keep to one 64Byte cache line.  */

	orr	tmp_a, syn_lo, syn_hi
	cbz	tmp_a, .Lreturn_limit		/* Last block had no NUL.  */

	/* The last block loaded contains a NUL.  Compute the string length
	   from it and return MIN (length, limit).  */

	sub	result, cursor, s_in		/* cursor is 16 past block.  */
	cbz	syn_lo, .Lnul_in_hi
#ifdef __AARCH64EB__
	mov	chunk_hi, chunk_lo
#endif
	sub	result, result, #8		/* NUL is in the first Dword.  */
	mov	syn_hi, syn_lo
.Lnul_in_hi:
#ifdef __AARCH64EB__
	/* On big-endian the borrow from the subtraction can spill into a
	   neighbouring byte (when the last string byte is 0x01), so the
	   syndrome cannot be used as is.  Byte-reverse the data and
	   recompute the syndrome from the reversed value.  */
	rev	chunk_hi, chunk_hi
	sub	tmp_a, chunk_hi, ones
	orr	tmp_b, chunk_hi, #BYTES_7F
	bic	syn_hi, tmp_a, tmp_b
#endif
	sub	result, result, #8
	rev	syn_hi, syn_hi
	clz	bitpos, syn_hi
	add	result, result, bitpos, lsr #3	/* Bit index -> byte index.  */
	cmp	result, maxlen
	csel	result, result, maxlen, ls	/* Unsigned minimum.  */
	ret

.Lunaligned_start:
	/* The string does not begin on a 16-byte boundary; tmp_a holds its
	   offset (1..15) inside the first block.  Two jobs are interleaved:
	   1) Work out the block count.  limit + offset - 1 may need 65 bits
	      when limit is close to ULONG_MAX, so the low four bits and the
	      remaining high bits are handled separately and only combined
	      after shifting.
	   2) Load the first block and force every byte that precedes the
	      string to 0xff so that those bytes can never look like a NUL.  */
	sub	blocks, maxlen, #1
	neg	tmp_d, tmp_a
	cmp	tmp_a, #8			/* Flags consumed by csinv/csel.  */

	and	tmp_c, blocks, #15
	lsr	blocks, blocks, #4
	mov	tmp_b, #~0

	ldp	chunk_lo, chunk_hi, [cursor], #16
	lsl	tmp_d, tmp_d, #3		/* -offset in bytes -> bits.  */
	add	tmp_c, tmp_c, tmp_a

#ifdef __AARCH64EB__
	/* Big-endian: the bytes to hide are the most significant ones.  */
	lsl	tmp_b, tmp_b, tmp_d		/* Shift count is taken mod 64.  */
#else
	/* Little-endian: the bytes to hide are the least significant ones.  */
	lsr	tmp_b, tmp_b, tmp_d		/* Shift count is taken mod 64.  */
#endif
	add	blocks, blocks, tmp_c, lsr #4

	orr	chunk_lo, chunk_lo, tmp_b
	orr	chunk_hi_m, chunk_hi, tmp_b

	/* offset <= 8: mask only the first Dword, keep the second as loaded.
	   offset >  8: the first Dword is entirely padding (all ones) and
	   the mask applies to the second Dword.  */
	csinv	chunk_lo, chunk_lo, xzr, le
	csel	chunk_hi, chunk_hi, chunk_hi_m, le
	b	.Lcheck_pair
	.size	strnlen, . - .Lpad_start	/* Size includes the pre-padding.  */

#endif