/* Copyright (c) 2013, 2018, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

#include <picolibc.h>

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
/* See strncmp-stub.c  */
#else

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 * MTE compatible.
 */

#include "asmdefs.h"

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define mask		x13
#define endloop		x14
#define count		mask
#define offset		pos
#define neg_offset	x15

/* Define endian dependent shift operations.
   On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
 */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif

ENTRY (strncmp)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	and	count, src1, #7
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
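	/* Illustrative only (not assembled): a minimal C sketch of the same
	   zero-byte test applied to one 64-bit word.  The helper name
	   has_zero_byte is hypothetical and not part of this file.

	   #include <stdint.h>
	   #include <stdbool.h>

	   static bool has_zero_byte (uint64_t x)
	   {
	     uint64_t ones   = 0x0101010101010101ULL;  // REP8_01
	     uint64_t sevenf = 0x7f7f7f7f7f7f7f7fULL;  // REP8_7f
	     // ~(x | sevenf) equals ~x & 0x80..80, so bit 7 of a byte in the
	     // result is set iff that byte of x is 0x00.
	     return ((x - ones) & ~(x | sevenf)) != 0;
	   }

	   For example, has_zero_byte (0x6162630065666768) is true and
	   has_zero_byte (0x6162636465666768) is false.  */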
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of main loop */

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs.  */
L(syndrome_check):
	/* Limit was reached.  Check if the NUL byte or the difference
	   is before the limit.  */
	rev	syndrome, syndrome
	rev	data1, data1
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	cmp	limit, pos, lsr #3
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi
	ret
#else
	/* Not reached the limit, must have found the end or a diff.  */
	tbz	limit, #63, L(not_limit)
	add	tmp1, limit, 8
	cbz	limit, L(not_limit)

	lsl	limit, tmp1, #3	/* Bytes -> bits.  */
	mov	mask, #~0
	lsr	mask, mask, limit
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif
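	/* Illustrative only (not assembled): a minimal C sketch, under the
	   same little-endian assumptions as L(syndrome_check) above, of how
	   the syndrome word (diff | has_nul) is turned into a result.  The
	   helper name dword_result is hypothetical; syndrome must be non-zero
	   and limit is the number of bytes still allowed in this dword.

	   #include <stdint.h>

	   static int dword_result (uint64_t data1, uint64_t data2,
	                            uint64_t syndrome, uint64_t limit)
	   {
	     // Byte-reverse so the earliest byte is most significant, then
	     // count leading zeros to locate the first mismatch or NUL.
	     unsigned i = __builtin_clzll (__builtin_bswap64 (syndrome)) >> 3;
	     if (i >= limit)
	       return 0;                             // mismatch past the limit
	     unsigned a = (data1 >> (8 * i)) & 0xff; // chars compare as
	     unsigned b = (data2 >> (8 * i)) & 0xff; // unsigned values
	     return (int) a - (int) b;
	   }

	   The sign of the value matches what the code above returns; the
	   magnitude may differ because the assembly extracts an 8-bit window
	   starting at the first differing bit rather than the whole byte.  */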
L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align).  */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
	/* Adjust the limit and ensure it doesn't overflow.  */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)

	.p2align 4
	/* Don't bother with dwords for up to 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
L(done):
	sub	result, data1, data2
	ret

	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop.  */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7
	sub	limit, limit, count

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order.  For little-endian, it is
	   reversed in the registers.  The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.
	   src1  | a a a a a a a a | b b b c c c c c | . . .
	   src2  | x x x x x a a a   a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
	                STEP_A              STEP_B              STEP_C
	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes.  This way we can read 16 bytes at a
	   time from SRC2.  The comparison happens in 3 steps.  After each step
	   the loop can exit, or read from SRC1 or SRC2.  */
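	/* Illustrative only (not assembled): a minimal C sketch, for the
	   little-endian case, of how STEP_A below rebuilds one src1-aligned
	   dword from the two halves of the 16-byte SRC2 load.  offset_bits
	   stands for the low six bits of the "offset" register and the helper
	   name combine_dword is hypothetical.

	   #include <stdint.h>

	   // offset_bits = (src2 & 7) * 8; SRC2 is not 8-byte aligned on this
	   // path, so offset_bits is a multiple of 8 in the range 8..56.
	   static uint64_t combine_dword (uint64_t tmp1, uint64_t tmp2,
	                                  unsigned offset_bits)
	   {
	     uint64_t lo = tmp1 >> offset_bits;          // early bytes of tmp1
	     uint64_t hi = tmp2 << (64 - offset_bits);   // later bytes of tmp2
	     return lo | hi;     // the combined dword compared against data1
	   }

	   On little-endian LS_FW is lsr and LS_BK is lsl, and neg_offset is
	   64 - offset modulo 64, which is what the two shifts above mirror.  */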
L(src1_aligned):
	/* Calculate offset from 8 byte alignment to string start in bits.  No
	   need to mask offset since shifts are ignoring upper bits.  */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later.  */
	/* Skip the first compare if data in tmp1 is irrelevant.  */
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.  */
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.  */
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1.  */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2.  */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid carry-propagation
	   problem described above.  This way we can reuse the has_nul in the
	   next step and also use syndrome value trick at the end.  */
	rev	tmp3, data1
	#define data1_fixed tmp3
#else
	#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1.  */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3
	b.lo	L(end_quick)
#endif

L(ret0):
	mov	result, #0
	ret
END(strncmp)
#endif
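	/* Illustrative only (not assembled): a minimal C reference model of
	   the semantics this routine implements -- compare at most n bytes
	   as unsigned char values, stopping at the first difference or NUL.
	   Callers may rely on the sign of the result, not its magnitude; the
	   name strncmp_ref is hypothetical.

	   #include <stddef.h>

	   static int strncmp_ref (const char *s1, const char *s2, size_t n)
	   {
	     const unsigned char *p1 = (const unsigned char *) s1;
	     const unsigned char *p2 = (const unsigned char *) s2;
	     for (; n != 0; n--, p1++, p2++)
	       {
	         if (*p1 != *p2)
	           return (int) *p1 - (int) *p2;   // first differing byte decides
	         if (*p1 == 0)
	           return 0;                       // both strings ended together
	       }
	     return 0;                             // limit reached, prefixes equal
	   }  */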