/* strcpy/stpcpy - copy a string returning pointer to start/end.

   Copyright (c) 2013, 2014, 2015 ARM Ltd.
   All Rights Reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the company nor the names of its contributors
         may be used to endorse or promote products derived from this
         software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
/* See strcpy-stub.c  */
#else

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

/* To build as stpcpy, define BUILD_STPCPY before compiling this file.

   To test the page crossing code path more thoroughly, compile with
   -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the
   slower entry path.  This option is not intended for production use.  */

/* Arguments and results.  */
#define dstin		x0
#define srcin		x1

/* Locals and temporaries.  */
#define src		x2
#define dst		x3
#define data1		x4
#define data1w		w4
#define data2		x5
#define data2w		w5
#define has_nul1	x6
#define has_nul2	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define tmp4		x11
#define zeroones	x12
#define data1a		x13
#define data2a		x14
#define pos		x15
#define len		x16
#define to_align	x17

#ifdef BUILD_STPCPY
#define STRCPY stpcpy
#else
#define STRCPY strcpy
#endif

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
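
	/* Purely as an illustration (not part of the build), the per-word
	   NUL test corresponds to this C sketch, assuming a 64-bit
	   uint64_t and the hypothetical helper name has_nul_syndrome:

	       uint64_t
	       has_nul_syndrome (uint64_t x)
	       {
	         return (x - REP8_01) & ~(x | REP8_7f);
	       }

	   The result is non-zero iff X contains a zero byte.  For a
	   little-endian load, the least-significant set bit of the
	   syndrome falls in the byte holding the first NUL; bytes above
	   the first NUL can show false positives from borrow propagation,
	   which is why the big-endian paths below recompute the syndrome
	   on byte-reversed data.  */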

	/* AArch64 systems have a minimum page size of 4k.  We can do a
	   quick page size check for crossing this boundary on entry and
	   if we do not, then we can short-circuit much of the entry code.
	   We expect early page-crossing strings to be rare (probability
	   of 16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
	   predictable, even with random strings.

	   We don't bother checking for larger page sizes; the cost of
	   setting up the correct page size is just not worth the extra
	   gain from a small reduction in the cases taking the slow path.
	   Note that we only care about whether the first fetch, which may
	   be misaligned, crosses a page boundary - after that we move to
	   aligned fetches for the remainder of the string.  */

#ifdef STRCPY_TEST_PAGE_CROSS
	/* Make everything that isn't Qword aligned look like a page cross.  */
#define MIN_PAGE_P2 4
#else
#define MIN_PAGE_P2 12
#endif

#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)

def_fn STRCPY p2align=6
	/* For moderately short strings, the fastest way to do the copy is
	   to calculate the length of the string in the same way as
	   strlen, then essentially do a memcpy of the result.  This
	   avoids the need for multiple byte copies and further means that
	   by the time we reach the bulk copy loop we know we can always
	   use DWord accesses.  We expect strcpy to rarely be called
	   repeatedly with the same source string, so branch prediction is
	   likely to always be difficult - we mitigate this by preferring
	   conditional select operations over branches whenever it is
	   feasible.  */
	and	tmp2, srcin, #(MIN_PAGE_SIZE - 1)
	mov	zeroones, #REP8_01
	and	to_align, srcin, #15
	cmp	tmp2, #(MIN_PAGE_SIZE - 16)
	neg	tmp1, to_align
	/* The first fetch will straddle a (possible) page boundary iff
	   srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
	   aligned string will never fail the page align check, so will
	   always take the fast path.  */
	b.gt	.Lpage_cross

.Lpage_cross_ok:
	ldp	data1, data2, [srcin]
#ifdef __AARCH64EB__
	/* Because we expect the end to be found within 16 characters
	   (profiling shows this is the most common case), it's worth
	   swapping the bytes now to save having to recalculate the
	   termination syndrome later.  We preserve data1 and data2
	   so that we can re-use the values later on.  */
	rev	tmp2, data1
	sub	tmp1, tmp2, zeroones
	orr	tmp2, tmp2, #REP8_7f
	bics	has_nul1, tmp1, tmp2
	b.ne	.Lfp_le8
	rev	tmp4, data2
	sub	tmp3, tmp4, zeroones
	orr	tmp4, tmp4, #REP8_7f
#else
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	bics	has_nul1, tmp1, tmp2
	b.ne	.Lfp_le8
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
#endif
	bics	has_nul2, tmp3, tmp4
	b.eq	.Lbulk_entry

	/* The string is short (<= 16 bytes).  We don't know exactly how
	   short though, yet.  Work out the exact length so that we can
	   quickly select the optimal copy strategy.  */
.Lfp_gt8:
	rev	has_nul2, has_nul2
	clz	pos, has_nul2
	mov	tmp2, #56
	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
	sub	pos, tmp2, pos
#ifdef __AARCH64EB__
	lsr	data2, data2, pos
#else
	lsl	data2, data2, pos
#endif
	str	data2, [dst, #1]
	str	data1, [dstin]
#ifdef BUILD_STPCPY
	add	dstin, dst, #8
#endif
	ret

.Lfp_le8:
	rev	has_nul1, has_nul1
	clz	pos, has_nul1
	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
	subs	tmp2, pos, #24			/* Pos in bits.  */
	b.lt	.Lfp_lt4
#ifdef __AARCH64EB__
	mov	tmp2, #56
	sub	pos, tmp2, pos
	lsr	data2, data1, pos
	lsr	data1, data1, #32
#else
	lsr	data2, data1, tmp2
#endif
	/* 4->7 bytes to copy.  */
	str	data2w, [dst, #-3]
	str	data1w, [dstin]
#ifdef BUILD_STPCPY
	mov	dstin, dst
#endif
	ret
.Lfp_lt4:
	cbz	pos, .Lfp_lt2
	/* 2->3 bytes to copy.  */
#ifdef __AARCH64EB__
	lsr	data1, data1, #48
#endif
	strh	data1w, [dstin]
	/* Fall-through, one byte (max) to go.  */
.Lfp_lt2:
	/* Null-terminated string.  Last character must be zero!  */
	strb	wzr, [dst]
#ifdef BUILD_STPCPY
	mov	dstin, dst
#endif
	ret
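
	/* An illustrative C model (not part of the build) of the
	   short-string strategy above, where dst, src and the length L
	   are hypothetical names for the destination, source and string
	   length.  Once L is known, the L + 1 bytes including the NUL are
	   written with two fixed-size stores whose ranges may overlap;
	   for L in [8,15]:

	       memcpy (dst, src, 8);
	       memcpy (dst + L - 7, src + L - 7, 8);

	   The second store is 8 bytes ending exactly at the NUL.  The
	   code above gets the same effect without reloading, by shifting
	   data2 so that the terminating NUL lands in the byte stored at
	   dst + L.  */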

	.p2align 6
	/* Aligning here ensures that the entry code and main loop all lie
	   within one 64-byte cache line.  */
.Lbulk_entry:
	sub	to_align, to_align, #16
	stp	data1, data2, [dstin]
	sub	src, srcin, to_align
	sub	dst, dstin, to_align
	b	.Lentry_no_page_cross

	/* The inner loop deals with two DWords at a time.  This has a
	   slightly higher start-up cost, but we should win quite quickly,
	   especially on cores with a high number of issue slots per
	   cycle, as we get much better parallelism out of the operations.  */
.Lmain_loop:
	stp	data1, data2, [dst], #16
.Lentry_no_page_cross:
	ldp	data1, data2, [src], #16
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bics	has_nul2, tmp3, tmp4
	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
	b.eq	.Lmain_loop

	/* Since we know we are copying at least 16 bytes, the fastest way
	   to deal with the tail is to determine the location of the
	   trailing NUL, then (re)copy the 16 bytes leading up to that.  */
	cmp	has_nul1, #0
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	csel	data1, data1, data2, ne
	rev	data1, data1
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	bic	has_nul1, tmp1, tmp2
#else
	csel	has_nul1, has_nul1, has_nul2, ne
#endif
	rev	has_nul1, has_nul1
	clz	pos, has_nul1
	add	tmp1, pos, #72
	add	pos, pos, #8
	csel	pos, pos, tmp1, ne
	add	src, src, pos, lsr #3
	add	dst, dst, pos, lsr #3
	ldp	data1, data2, [src, #-32]
	stp	data1, data2, [dst, #-16]
#ifdef BUILD_STPCPY
	sub	dstin, dst, #1
#endif
	ret
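
	/* In effect, the tail handling above amounts to this C sketch
	   (illustrative only, not part of the build), where n is the
	   offset of the terminating NUL from srcin.  The bulk path is
	   only entered when no NUL lies in the first 16 bytes, so
	   n >= 16 and the final, possibly overlapping, copy is

	       memcpy (dstin + n - 15, srcin + n - 15, 16);

	   i.e. 16 bytes ending exactly at the terminator.  Any bytes it
	   rewrites were already stored by the main loop, so the overlap
	   is harmless.  */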

.Lpage_cross:
	bic	src, srcin, #15
	/* Start by loading two words at [srcin & ~15], then forcing the
	   bytes that precede srcin to 0xff.  This means they never look
	   like termination bytes.  */
	ldp	data1, data2, [src]
	lsl	tmp1, tmp1, #3	/* Bytes beyond alignment -> bits.  */
	tst	to_align, #7
	csetm	tmp2, ne
#ifdef __AARCH64EB__
	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#else
	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#endif
	orr	data1, data1, tmp2
	orr	data2a, data2, tmp2
	cmp	to_align, #8
	csinv	data1, data1, xzr, lt
	csel	data2, data2, data2a, lt
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bics	has_nul2, tmp3, tmp4
	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
	b.eq	.Lpage_cross_ok
	/* We now need to make data1 and data2 look like they've been
	   loaded directly from srcin.  Do a rotate on the 128-bit value.  */
	lsl	tmp1, to_align, #3	/* Bytes->bits.  */
	neg	tmp2, to_align, lsl #3
#ifdef __AARCH64EB__
	lsl	data1a, data1, tmp1
	lsr	tmp4, data2, tmp2
	lsl	data2, data2, tmp1
	orr	tmp4, tmp4, data1a
	cmp	to_align, #8
	csel	data1, tmp4, data2, lt
	rev	tmp2, data1
	rev	tmp4, data2
	sub	tmp1, tmp2, zeroones
	orr	tmp2, tmp2, #REP8_7f
	sub	tmp3, tmp4, zeroones
	orr	tmp4, tmp4, #REP8_7f
#else
	lsr	data1a, data1, tmp1
	lsl	tmp4, data2, tmp2
	lsr	data2, data2, tmp1
	orr	tmp4, tmp4, data1a
	cmp	to_align, #8
	csel	data1, tmp4, data2, lt
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
#endif
	bic	has_nul1, tmp1, tmp2
	cbnz	has_nul1, .Lfp_le8
	bic	has_nul2, tmp3, tmp4
	b	.Lfp_gt8

	.size	STRCPY, . - STRCPY
#endif
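
/* For reference: an illustrative C model of the .Lpage_cross prefix
   masking above (not part of the build; buf and a are hypothetical
   names).  A 16-byte read from the 16-byte-aligned base cannot cross a
   page, and overwriting the bytes that precede srcin with 0xff
   guarantees the syndrome check cannot see a spurious NUL in them:

       unsigned char buf[16];
       size_t a = (uintptr_t) srcin & 15;
       memcpy (buf, (unsigned char *) srcin - a, 16);
       memset (buf, 0xff, a);

   The register version applies the same idea with shifted all-ones
   masks instead of a byte loop, then rotates the 128-bit value so the
   data looks as if it had been loaded straight from srcin.  */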