/* Copyright (c) 2012-2018, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
/* See strcmp-stub.c  */
#else

	/* Open a global function symbol F in .text, aligned to
	   2^P2ALIGN bytes.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/* File-local (non-exported) label.  */
#define L(label) .L ## label

/* Byte-replicated constants used by the word-at-a-time NUL test.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define result		x0

/* Internal variables.  */
#define data1		x2
#define data1w		w2
#define data2		x3
#define data2w		w3
#define has_nul		x4
#define diff		x5
#define syndrome	x6
#define tmp1		x7
#define tmp2		x8
#define tmp3		x9
#define zeroones	x10
#define pos		x11

/* int strcmp (const char *src1, const char *src2)

   In:       x0 = src1, x1 = src2.
   Out:      x0 = result; zero when the strings are equal, otherwise
	     negative or positive according to the first pair of bytes
	     that differ, compared as unsigned char.
   Clobbers: x2-x11 (see the register names above) and the condition
	     flags.  The stack pointer is never referenced.

   Three entry cases are distinguished:
     - both pointers 8-byte aligned: fall straight into the dword loop;
     - same misalignment in both pointers: L(mutual_align);
     - different misalignment: L(misaligned8).  */

	/* Start of performance-critical section  -- one 64B cache line.  */
def_fn strcmp p2align=6
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	b.ne	L(misaligned8)
	ands	tmp1, src1, #7
	b.ne	L(mutual_align)
	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	orr	syndrome, diff, has_nul
	cbz	syndrome, L(loop_aligned)
	/* End of performance-critical section  -- one 64B cache line.  */

	/* A difference or a NUL was found in the current dword pair:
	   isolate the first such byte and compute the result from it.  */
L(end):
#ifndef	__AARCH64EB__
	rev	syndrome, syndrome
	rev	data1, data1
	/* The MS-non-zero bit of the syndrome marks either the first bit
	   that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the values before
	   subtracting, so that the sign of the difference is correct.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#else
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The MS-non-zero bit of the syndrome marks either the first bit
	   that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the values before
	   subtracting, so that the sign of the difference is correct.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.  On entry tmp1 holds
	   src1 & 7 (set by the ANDS at function entry).  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
	ldr	data1, [src1], #8
	neg	tmp1, tmp1		/* Bits to alignment -64.  */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#endif
	/* Force the bytes before the start point to 0xff in both words so
	   that they compare equal and cannot look like a NUL.  */
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)

L(misaligned8):
	/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
	   checking to make sure that we don't access beyond page boundary in
	   SRC2.  */
	tst	src1, #7
	b.eq	L(loop_misaligned)
L(do_misaligned):
	/* Byte-at-a-time loop; runs until SRC1 is 8-byte aligned again.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)			/* NUL in SRC1 or bytes differ.  */
	tst	src1, #7
	b.ne	L(do_misaligned)

L(loop_misaligned):
	/* Test if we are within the last dword of the end of a 4K page.  If
	   yes then jump back to the misaligned loop to compare a byte at a
	   time.  */
	and	tmp1, src2, #0xff8
	eor	tmp1, tmp1, #0xff8
	cbz	tmp1, L(do_misaligned)
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8

	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	orr	syndrome, diff, has_nul
	cbz	syndrome, L(loop_misaligned)
	b	L(end)

L(done):
	/* data1/data2 hold the zero-extended bytes loaded by LDRB.  */
	sub	result, data1, data2
	ret
	.size	strcmp, .-strcmp

#endif