1/* Copyright (c) 2013, 2018, Linaro Limited 2 All rights reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions are met: 6 * Redistributions of source code must retain the above copyright 7 notice, this list of conditions and the following disclaimer. 8 * Redistributions in binary form must reproduce the above copyright 9 notice, this list of conditions and the following disclaimer in the 10 documentation and/or other materials provided with the distribution. 11 * Neither the name of the Linaro nor the 12 names of its contributors may be used to endorse or promote products 13 derived from this software without specific prior written permission. 14 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ 26 27#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__) 28/* See strcmp-stub.c */ 29#else 30 31/* Assumptions: 32 * 33 * ARMv8-a, AArch64 34 */ 35 36 .macro def_fn f p2align=0 37 .text 38 .p2align \p2align 39 .global \f 40 .type \f, %function 41\f: 42 .endm 43 44#define REP8_01 0x0101010101010101 45#define REP8_7f 0x7f7f7f7f7f7f7f7f 46#define REP8_80 0x8080808080808080 47 48/* Parameters and result. */ 49#define src1 x0 50#define src2 x1 51#define limit x2 52#define result x0 53 54/* Internal variables. */ 55#define data1 x3 56#define data1w w3 57#define data2 x4 58#define data2w w4 59#define has_nul x5 60#define diff x6 61#define syndrome x7 62#define tmp1 x8 63#define tmp2 x9 64#define tmp3 x10 65#define zeroones x11 66#define pos x12 67#define limit_wd x13 68#define mask x14 69#define endloop x15 70#define count mask 71 72 .text 73 .p2align 6 74 .rep 7 75 nop /* Pad so that the loop below fits a cache line. */ 76 .endr 77def_fn strncmp 78 cbz limit, .Lret0 79 eor tmp1, src1, src2 80 mov zeroones, #REP8_01 81 tst tmp1, #7 82 and count, src1, #7 83 b.ne .Lmisaligned8 84 cbnz count, .Lmutual_align 85 /* Calculate the number of full and partial words -1. */ 86 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ 87 lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ 88 89 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 90 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 91 can be done in parallel across the entire word. */ 92 /* Start of performance-critical section -- one 64B cache line. */ 93.Lloop_aligned: 94 ldr data1, [src1], #8 95 ldr data2, [src2], #8 96.Lstart_realigned: 97 subs limit_wd, limit_wd, #1 98 sub tmp1, data1, zeroones 99 orr tmp2, data1, #REP8_7f 100 eor diff, data1, data2 /* Non-zero if differences found. */ 101 csinv endloop, diff, xzr, pl /* Last Dword or differences. */ 102 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ 103 ccmp endloop, #0, #0, eq 104 b.eq .Lloop_aligned 105 /* End of performance-critical section -- one 64B cache line. */ 106 107 /* Not reached the limit, must have found the end or a diff. */ 108 tbz limit_wd, #63, .Lnot_limit 109 110 /* Limit % 8 == 0 => all bytes significant. */ 111 ands limit, limit, #7 112 b.eq .Lnot_limit 113 114 lsl limit, limit, #3 /* Bits -> bytes. */ 115 mov mask, #~0 116#ifdef __AARCH64EB__ 117 lsr mask, mask, limit 118#else 119 lsl mask, mask, limit 120#endif 121 bic data1, data1, mask 122 bic data2, data2, mask 123 124 /* Make sure that the NUL byte is marked in the syndrome. */ 125 orr has_nul, has_nul, mask 126 127.Lnot_limit: 128 orr syndrome, diff, has_nul 129 130#ifndef __AARCH64EB__ 131 rev syndrome, syndrome 132 rev data1, data1 133 /* The MS-non-zero bit of the syndrome marks either the first bit 134 that is different, or the top bit of the first zero byte. 135 Shifting left now will bring the critical information into the 136 top bits. */ 137 clz pos, syndrome 138 rev data2, data2 139 lsl data1, data1, pos 140 lsl data2, data2, pos 141 /* But we need to zero-extend (char is unsigned) the value and then 142 perform a signed 32-bit subtraction. */ 143 lsr data1, data1, #56 144 sub result, data1, data2, lsr #56 145 ret 146#else 147 /* For big-endian we cannot use the trick with the syndrome value 148 as carry-propagation can corrupt the upper bits if the trailing 149 bytes in the string contain 0x01. */ 150 /* However, if there is no NUL byte in the dword, we can generate 151 the result directly. We can't just subtract the bytes as the 152 MSB might be significant. */ 153 cbnz has_nul, 1f 154 cmp data1, data2 155 cset result, ne 156 cneg result, result, lo 157 ret 1581: 159 /* Re-compute the NUL-byte detection, using a byte-reversed value. */ 160 rev tmp3, data1 161 sub tmp1, tmp3, zeroones 162 orr tmp2, tmp3, #REP8_7f 163 bic has_nul, tmp1, tmp2 164 rev has_nul, has_nul 165 orr syndrome, diff, has_nul 166 clz pos, syndrome 167 /* The MS-non-zero bit of the syndrome marks either the first bit 168 that is different, or the top bit of the first zero byte. 169 Shifting left now will bring the critical information into the 170 top bits. */ 171 lsl data1, data1, pos 172 lsl data2, data2, pos 173 /* But we need to zero-extend (char is unsigned) the value and then 174 perform a signed 32-bit subtraction. */ 175 lsr data1, data1, #56 176 sub result, data1, data2, lsr #56 177 ret 178#endif 179 180.Lmutual_align: 181 /* Sources are mutually aligned, but are not currently at an 182 alignment boundary. Round down the addresses and then mask off 183 the bytes that precede the start point. 184 We also need to adjust the limit calculations, but without 185 overflowing if the limit is near ULONG_MAX. */ 186 bic src1, src1, #7 187 bic src2, src2, #7 188 ldr data1, [src1], #8 189 neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ 190 ldr data2, [src2], #8 191 mov tmp2, #~0 192 sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ 193#ifdef __AARCH64EB__ 194 /* Big-endian. Early bytes are at MSB. */ 195 lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ 196#else 197 /* Little-endian. Early bytes are at LSB. */ 198 lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ 199#endif 200 and tmp3, limit_wd, #7 201 lsr limit_wd, limit_wd, #3 202 /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ 203 add limit, limit, count 204 add tmp3, tmp3, count 205 orr data1, data1, tmp2 206 orr data2, data2, tmp2 207 add limit_wd, limit_wd, tmp3, lsr #3 208 b .Lstart_realigned 209 210 .p2align 6 211 /* Don't bother with dwords for up to 16 bytes. */ 212.Lmisaligned8: 213 cmp limit, #16 214 b.hs .Ltry_misaligned_words 215 216.Lbyte_loop: 217 /* Perhaps we can do better than this. */ 218 ldrb data1w, [src1], #1 219 ldrb data2w, [src2], #1 220 subs limit, limit, #1 221 ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ 222 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ 223 b.eq .Lbyte_loop 224.Ldone: 225 sub result, data1, data2 226 ret 227 /* Align the SRC1 to a dword by doing a bytewise compare and then do 228 the dword loop. */ 229.Ltry_misaligned_words: 230 lsr limit_wd, limit, #3 231 cbz count, .Ldo_misaligned 232 233 neg count, count 234 and count, count, #7 235 sub limit, limit, count 236 lsr limit_wd, limit, #3 237 238.Lpage_end_loop: 239 ldrb data1w, [src1], #1 240 ldrb data2w, [src2], #1 241 cmp data1w, #1 242 ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ 243 b.ne .Ldone 244 subs count, count, #1 245 b.hi .Lpage_end_loop 246 247.Ldo_misaligned: 248 /* Prepare ourselves for the next page crossing. Unlike the aligned 249 loop, we fetch 1 less dword because we risk crossing bounds on 250 SRC2. */ 251 mov count, #8 252 subs limit_wd, limit_wd, #1 253 b.lo .Ldone_loop 254.Lloop_misaligned: 255 and tmp2, src2, #0xff8 256 eor tmp2, tmp2, #0xff8 257 cbz tmp2, .Lpage_end_loop 258 259 ldr data1, [src1], #8 260 ldr data2, [src2], #8 261 sub tmp1, data1, zeroones 262 orr tmp2, data1, #REP8_7f 263 eor diff, data1, data2 /* Non-zero if differences found. */ 264 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ 265 ccmp diff, #0, #0, eq 266 b.ne .Lnot_limit 267 subs limit_wd, limit_wd, #1 268 b.pl .Lloop_misaligned 269 270.Ldone_loop: 271 /* We found a difference or a NULL before the limit was reached. */ 272 and limit, limit, #7 273 cbz limit, .Lnot_limit 274 /* Read the last word. */ 275 sub src1, src1, 8 276 sub src2, src2, 8 277 ldr data1, [src1, limit] 278 ldr data2, [src2, limit] 279 sub tmp1, data1, zeroones 280 orr tmp2, data1, #REP8_7f 281 eor diff, data1, data2 /* Non-zero if differences found. */ 282 bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ 283 ccmp diff, #0, #0, eq 284 b.ne .Lnot_limit 285 286.Lret0: 287 mov result, #0 288 ret 289 .size strncmp, . - strncmp 290#endif 291