1/* 2 Copyright (c) 2024, Synopsys, Inc. All rights reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions are met: 6 7 1) Redistributions of source code must retain the above copyright notice, 8 this list of conditions and the following disclaimer. 9 10 2) Redistributions in binary form must reproduce the above copyright notice, 11 this list of conditions and the following disclaimer in the documentation 12 and/or other materials provided with the distribution. 13 14 3) Neither the name of the Synopsys, Inc., nor the names of its contributors 15 may be used to endorse or promote products derived from this software 16 without specific prior written permission. 17 18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 22 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 POSSIBILITY OF SUCH DAMAGE. 29*/ 30 31#include <picolibc.h> 32 33#include <sys/asm.h> 34 35; r0 void* dest 36; r1 const void* src 37; r2 size_t count 38 39; The 64-bit crunching implementation. 
#if defined (__ARC64_ARCH32__) && !defined(__ARC64_LL64__)

;-----------------------------------------------------------------------
; void *memmove(void *dest, const void *src, size_t count)
; 32-bit-register variant (no 64-bit LL64 load/store pairs available):
; moves 16 bytes per loop iteration with four word ld/st pairs.
; In:   r0 = dest, r1 = src, r2 = count
; Out:  r0 = dest (never written; all copying goes through r3)
; Delay slots (".d" branches) carry useful work: the chunk counter,
; the tail-branch index math, and the final store of each loop.
;-----------------------------------------------------------------------
ENTRY (memmove)

; If the destination is greater than the source
	cmp	r0, r1
	ADDP	r4, r1, r2
; or if the source plus count is smaller than the destination
	cmp.eq	r4, r0
; NOTE(review): the second compare is predicated on EQ, so it only re-tests
; when dest == src; for dest > src the backward path is always taken —
; conservative but safe. Confirm against ARC predication semantics.

; We can safely perform a normal memcpy. Otherwise, we need to perform it
; backwards
	blo.d	@.L_normal_memcpy
	lsr.f	r11, r2, 4		; counter for 16-byte chunks (sets flags for jeq below)

	ADDP	r3, r0, r2		; r3 = one past the end of dest

; Backwards search
; The only thing that changes between memcpy and memmove is copy direction
; in case the dest and src address memory locations overlap
; More detailed information is in the forwards copy and at the end of
; this document

	ADDP	r1, r1, r2		; r1 = one past the end of src
	bmsk_s	r2, r2, 3		; r2 = count % 16 (tail bytes)

; Tail: copy 2 bytes (from the end) if bit 1 of the remainder is set
	bbit0.d	r2, 1, @1f
	lsr	r5, r2, 2		; r5 = remaining whole words (0..3)
	ldh.aw	r4, [r1, -2]
	sth.aw	r4, [r3, -2]
1:
; Tail: copy 1 byte if bit 0 of the remainder is set
	bbit0.d	r2, 0, @1f
	xor	r5, r5, 3		; branch index = 3 - words (see table at end of file)
	ldb.aw	r4, [r1, -1]
	stb.aw	r4, [r3, -1]
1:
	asl	r5, r5, 1		; "bi" steps in 4-byte slots; each ld/st pair is 8 bytes
	bi	[r5]			; computed jump: fall into 3, 2, 1 or 0 word copies
	ld.aw	r4,[r1, -4]
	st.aw	r4,[r3, -4]
	ld.aw	r4,[r1, -4]
	st.aw	r4,[r3, -4]
	ld.aw	r4,[r1, -4]
	st.aw	r4,[r3, -4]

; Return if there are no 16 byte chunks
; (flags are still those of "lsr.f r11, r2, 4" above; no ".f" form in between)
	jeq	[blink]

.L_write_backwards_16_bytes:		; 16 bytes per iteration, descending addresses
	ld.aw	r4, [r1, -4]
	ld.aw	r5, [r1, -4]
	ld.aw	r6, [r1, -4]
	ld.aw	r7, [r1, -4]
	st.aw	r4, [r3, -4]
	st.aw	r5, [r3, -4]
	st.aw	r6, [r3, -4]
	dbnz.d	r11, @.L_write_backwards_16_bytes
	st.aw	r7, [r3, -4]		; final store rides in the delay slot

	j_s	[blink]

; Forward copy (plain memcpy direction)
.L_normal_memcpy:
	beq.d	@.L_write_forwards_15_bytes	; no full 16-byte chunks
	mov	r3, r0			; work on a copy of "r0"

.L_write_forwards_16_bytes:		; 16 bytes per iteration, ascending addresses
	ld.ab	r4, [r1, 4]
	ld.ab	r5, [r1, 4]
	ld.ab	r6, [r1, 4]
	ld.ab	r7, [r1, 4]
	st.ab	r4, [r3, 4]
	st.ab	r5, [r3, 4]
	st.ab	r6, [r3, 4]
	dbnz.d	r11, @.L_write_forwards_16_bytes
	st.ab	r7, [r3, 4]		; final store rides in the delay slot
	bmsk_s	r2, r2, 3		; r2 = count % 16 (tail bytes)

.L_write_forwards_15_bytes:
; Tail: copy 2 bytes if bit 1 of the remainder is set
	bbit0.d	r2, 1, @1f
	lsr	r11, r2, 2		; r11 = remaining whole words (0..3)
	ldh.ab	r4, [r1, 2]
	sth.ab	r4, [r3, 2]
1:
; Tail: copy 1 byte if bit 0 of the remainder is set
	bbit0.d	r2, 0, @1f
	xor	r11, r11, 3		; branch index = 3 - words
	ldb.ab	r4, [r1, 1]
	stb.ab	r4, [r3, 1]
1:
	asl	r11, r11, 1		; scale for "bi" (4-byte slots, 8-byte pairs)
	bi	[r11]			; computed jump: fall into 3, 2, 1 or 0 word copies
	ld.ab	r4,[r1, 4]
	st.ab	r4,[r3, 4]
	ld.ab	r4,[r1, 4]
	st.ab	r4,[r3, 4]
	ld	r4,[r1]		; last word needs no address writeback
	st	r4,[r3]

	j_s	[blink]

ENDFUNC (memmove)

#else

;-----------------------------------------------------------------------
; void *memmove(void *dest, const void *src, size_t count)
; The 64-bit crunching variant: moves 32 bytes per loop iteration using
; 64-bit (LD64/ST64) or 128-bit (lddl/stdl) load/store pairs.
; In:   r0 = dest, r1 = src, r2 = count
; Out:  r0 = dest (never written; all copying goes through r3)
;-----------------------------------------------------------------------
ENTRY (memmove)
; If the destination is greater than the source
	cmp	r0, r1
	ADDP	r4, r1, r2
; or if the source plus count is smaller than the destination
	cmp.eq	r4, r0
; NOTE(review): see the matching note in the 32-bit variant — the EQ-predicated
; re-test is conservative; dest > src always takes the backward path.

; We can safely perform a normal memcpy. Otherwise, we need to perform it
; backwards
	blo.d	@.L_normal_memcpy
	LSRP.f	r12, r2, 5		; counter for 32-byte chunks (sets flags for jeq below)

	ADDP	r3, r0, r2		; r3 = one past the end of dest

; Backwards search
; The only thing that changes between memcpy and memmove is copy direction
; in case the dest and src address memory locations overlap
; More detailed information is in the forwards copy and at the end of
; this document

; Set both r0 and r1 to point to the end of each memory location
	ADDP	r1, r1, r2
	bmsk_s	r2, r2, 4		; r2 = count % 32 (tail bytes)

; Tail: copy 1 byte (from the end) if bit 0 of the remainder is set
	bbit0.d	r2, 0, @1f
	lsr	r11, r2, 3		; r11 = remaining 8-byte chunks (0..3)
	ldb.aw	r4, [r1, -1]
	stb.aw	r4, [r3, -1]
1:
; Tail: copy 2 bytes if bit 1 of the remainder is set
	bbit0.d	r2, 1, @1f
	xor	r11, r11, 3		; branch index = 3 - chunks (see notes below)
	ldh.aw	r4, [r1, -2]
	sth.aw	r4, [r3, -2]
1:
; Tail: copy 4 bytes if bit 2 of the remainder is set
	bbit0.d	r2, 2, @1f
	asl	r11, r11, 1		; scale for "bi" (see notes below)
	ld.aw	r4, [r1, -4]
	st.aw	r4, [r3, -4]
1:
	bi	[r11]			; computed jump: fall into 3, 2, 1 or 0 8-byte copies
	LD64.aw	r4, [r1, -8]
	ST64.aw	r4, [r3, -8]
	LD64.aw	r4, [r1, -8]
	ST64.aw	r4, [r3, -8]
	LD64.aw	r4, [r1, -8]
	ST64.aw	r4, [r3, -8]

; Jump if there are no 32 byte chunks
; (flags are still those of "LSRP.f r12, r2, 5" above)
	jeq	[blink]

.L_write_backwards_32_bytes:		; Take care of 32 byte chunks
#if defined (__ARC64_M128__)

	lddl.aw	r4r5, [r1, -16]
	lddl.aw	r6r7, [r1, -16]

	stdl.aw	r4r5, [r3, -16]
	stdl.aw	r6r7, [r3, -16]
	dbnz	r12, @.L_write_backwards_32_bytes

#elif defined (__ARC64_ARCH64__) || ( defined (__ARC64_ARCH32__) && defined (__ARC64_LL64__) )

	LD64.aw	r4, [r1, -8]
	LD64.aw	r6, [r1, -8]
	LD64.aw	r8, [r1, -8]
	LD64.aw	r10,[r1, -8]

	ST64.aw	r4, [r3, -8]
	ST64.aw	r6, [r3, -8]
	ST64.aw	r8, [r3, -8]
	dbnz.d	r12, @.L_write_backwards_32_bytes
	ST64.aw	r10, [r3, -8]		; final store rides in the delay slot

#else
# error Unknown configuration
#endif

	j_s	[blink]

; Normal memcpy
.L_normal_memcpy:
	;LSRP.f r12, r2, 5		; Moved up

	beq.d	@.L_write_forwards_31_bytes	; no full 32-byte chunks
	MOVP	r3, r0			; do not clobber the "dest"

.L_write_forwards_32_bytes:		; Take care of 32 byte chunks
#if defined (__ARC64_M128__)

	lddl.ab	r4r5, [r1, +16]
	lddl.ab	r6r7, [r1, +16]

	stdl.ab	r4r5, [r3, +16]
	stdl.ab	r6r7, [r3, +16]
	dbnz	r12, @.L_write_forwards_32_bytes

#elif defined (__ARC64_ARCH64__) || ( defined (__ARC64_ARCH32__) && defined (__ARC64_LL64__) )

	LD64.ab	r4, [r1, +8]
	LD64.ab	r6, [r1, +8]
	LD64.ab	r8, [r1, +8]
	LD64.ab	r10,[r1, +8]
	ST64.ab	r4, [r3, +8]
	ST64.ab	r6, [r3, +8]
	ST64.ab	r8, [r3, +8]
	dbnz.d	r12, @.L_write_forwards_32_bytes
	ST64.ab	r10, [r3, +8]		; Shove store in delay slot

#else
# error Unknown configuration
#endif

	bmsk_s	r2, r2, 4		; From now on, we only care for the remainder % 32

; The remainder bits indicating how many more bytes to copy
; .------------------------.
; | b4 | b3 | b2 | b1 | b0 |
; `------------------------'
;   16    8    4    2    1
.L_write_forwards_31_bytes:
	bbit0.d	r2, 2, @1f		; is b2 set? then copy 4 bytes
	lsr	r12, r2, 3		; see the notes below
	ld.ab	r4, [r1, 4]
	st.ab	r4, [r3, 4]
1:
	bbit0.d	r2, 1, @1f		; is b1 set? then copy 2 bytes
	xor	r12, r12, 3
	ldh.ab	r4, [r1, 2]
	sth.ab	r4, [r3, 2]
1:
	bbit0.d	r2, 0, @1f		; is b0 set? then copy 1 byte
	asl	r12, r12, 1
	ldb.ab	r4, [r1, 1]
	stb.ab	r4, [r3, 1]

; Interpreting bits (b4,b3) [1] and how they correlate to branch index:
;
; (b4,b3) | bytes to copy | branch index
; --------+---------------+-------------
;    00b  |       0       |   3 (11b)
;    01b  |       8       |   2 (10b)
;    10b  |      16       |   1 (01b)
;    11b  |      24       |   0 (00b)
;
; To go from (b4,b3) to branch index, the bits must be flipped.
; In other words, they must be XORed with 11b [2].
;
; Last but not least, "bi" jumps at boundaries of 4. We need to double
; the index to jump 8 bytes [3].
;
; Hence, the 3 operations for calculating the branch index that are spread
; in "bbit0" delay slots:
;
; lsr r12, r2, 3 [1]
; xor r12, r12, 3 [2]
; asl r12, r12, 1 [3]
1:
	bi	[r12]			; computed jump into the LD64/ST64 pairs below
	LD64.ab	r4, [r1, 8]
	ST64.ab	r4, [r3, 8]
	LD64.ab	r4, [r1, 8]
	ST64.ab	r4, [r3, 8]
	LD64.ab	r4, [r1, 8]
	ST64.ab	r4, [r3, 8]

	j_s	[blink]

ENDFUNC (memmove)

#endif