/* strcpy/stpcpy - copy a string returning pointer to start/end.

   Copyright (c) 2013, 2014, 2015, 2020-2023 ARM Ltd.
   All Rights Reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
     * Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
     * Neither the name of the company nor the names of its contributors
       may be used to endorse or promote products derived from this
       software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
#include <picolibc.h>

/* This vector implementation requires a 64-bit target with Advanced SIMD;
   when optimising for size, or on other configurations, the generic C
   fallback (strcpy-stub.c) is built instead.  */
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__) || !defined(__ARM_NEON)
/* See strcpy-stub.c */
#else

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

/* Register roles.  Note the deliberate aliasing: len/synd and tmp/shift
   share a register each because their live ranges never overlap.  */
#define dstin	x0	/* First argument: destination pointer.  */
#define srcin	x1	/* Second argument: source pointer.  */
#define result	x0	/* Return value (stpcpy only: pointer to the NUL).  */

#define src	x2	/* 16-byte-aligned source read cursor.  */
#define dst	x3	/* Destination write cursor in the main loop.  */
#define len	x4	/* String length, i.e. offset of the NUL byte.  */
#define synd	x4	/* NUL-match syndrome (64-bit nibble mask).  */
#define tmp	x5	/* Scratch / src-dst displacement.  */
#define shift	x5	/* Bit count used to discard pre-string bytes.  */
#define data1	x6
#define dataw1	w6
#define data2	x7
#define dataw2	w7

#define dataq	q0	/* 128-bit view of the current source chunk.  */
#define vdata	v0
#define vhas_nul v1	/* Per-byte result of the compare against NUL.  */
#define vend	v2	/* Narrowed (nibble-mask) syndrome.  */
#define dend	d2	/* 64-bit view of vend, for fmov into a GPR.  */
#define dataq2	q1

/* One source builds both entry points: stpcpy additionally returns a
   pointer to the copied NUL via the IFSTPCPY-guarded instructions,
   while strcpy leaves dstin (x0) as the return value.  */
#ifdef BUILD_STPCPY
# define STRCPY stpcpy
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY strcpy
# define IFSTPCPY(X,...)
#endif

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
   per byte. We take 4 bits of every comparison byte with shift right and narrow
   by 4 instruction. Since the bits in the nibble mask reflect the order in
   which things occur in the original string, counting leading zeros identifies
   exactly which byte matched. */

ENTRY (STRCPY)
	PTR_ARG (0)
	PTR_ARG (1)
	/* Examine the 16-byte aligned chunk containing the string start.
	   The aligned load is MTE compatible: it never touches a 16-byte
	   granule that the string itself does not occupy.  */
	bic	src, srcin, 15
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0	/* 0xff in each NUL lane.  */
	lsl	shift, srcin, 2		/* 4 syndrome bits per byte before srcin
					   (lsr below uses only the low 6 bits).  */
	shrn	vend.8b, vhas_nul.8h, 4	/* 128-bit mask -> 64-bit nibble mask.  */
	fmov	synd, dend
	lsr	synd, synd, shift	/* Discard bytes preceding the string.  */
	cbnz	synd, L(tail)		/* NUL within the first chunk.  */

	/* No NUL yet: check the next aligned 16-byte chunk.  */
	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	cbz	synd, L(start_loop)	/* Still no NUL - enter the main loop.  */

	/* NUL in the second chunk: compute the full length, then copy.  */
#ifndef __AARCH64EB__
	rbit	synd, synd		/* Make clz count from the string start.  */
#endif
	sub	tmp, src, srcin		/* Bytes covered by the first chunk.  */
	clz	len, synd
	add	len, tmp, len, lsr 2	/* len = offset of the NUL byte.  */
	tbz	len, 4, L(less16)
	/* 16 <= len <= 31: copy the first 16 bytes and the last 16 bytes
	   (which end at the NUL); the two stores may overlap.  */
	sub	tmp, len, 15
	ldr	dataq, [srcin]
	ldr	dataq2, [srcin, tmp]
	str	dataq, [dstin]
	str	dataq2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

L(tail):
	/* NUL in the first chunk; the syndrome was already shifted so that
	   nibble 0 corresponds to srcin[0].  */
	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 2		/* Nibble index -> byte index.  */
L(less16):
	tbz	len, 3, L(less8)
	/* 8 <= len <= 15: two possibly-overlapping 8-byte copies.  */
	sub	tmp, len, 7
	ldr	data1, [srcin]
	ldr	data2, [srcin, tmp]
	str	data1, [dstin]
	str	data2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(less8):
	subs	tmp, len, 3
	b.lo	L(less4)
	/* 3 <= len <= 7: two possibly-overlapping 4-byte copies.  */
	ldr	dataw1, [srcin]
	ldr	dataw2, [srcin, tmp]
	str	dataw1, [dstin]
	str	dataw2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

L(less4):
	cbz	len, L(zerobyte)
	/* len is 1 or 2: a 2-byte copy never reads past the NUL.  */
	ldrh	dataw1, [srcin]
	strh	dataw1, [dstin]
L(zerobyte):
	strb	wzr, [dstin, len]	/* Always write the terminating NUL.  */
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(start_loop):
	/* Copy the (unaligned) first 16 bytes, then loop over aligned
	   source chunks.  tmp holds the constant srcin - dstin so the
	   source address can be recomputed as dst + tmp at loop end.  */
	sub	tmp, srcin, dstin
	ldr	dataq2, [srcin]
	sub	dst, src, tmp		/* dst = dstin + (src - srcin).  */
	str	dataq2, [dstin]
L(loop):
	/* Unrolled x2: dataq always holds the latest chunk known to be
	   NUL-free, which is stored before the next chunk is checked.  */
	str	dataq, [dst], 32
	ldr	dataq, [src, 16]
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* Any NUL? (128->64).  */
	fmov	synd, dend
	cbnz	synd, L(loopend)
	str	dataq, [dst, -16]
	ldr	dataq, [src, 32]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop)
	add	dst, dst, 16		/* Exited from second half: fix up dst.  */
L(loopend):
	shrn	vend.8b, vhas_nul.8h, 4	/* 128->64 */
	fmov	synd, dend
	sub	dst, dst, 31
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	len, synd
	lsr	len, len, 2		/* Offset of the NUL in the final chunk.  */
	add	dst, dst, len
	/* Final (possibly overlapping) 16-byte copy ending at the NUL.  */
	ldr	dataq, [dst, tmp]
	str	dataq, [dst]
	IFSTPCPY (add result, dst, 15)
	ret

END (STRCPY)
#endif