/*
   Copyright (c) 2015-2024, Synopsys, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:

   1) Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.

   2) Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

   3) Neither the name of the Synopsys, Inc., nor the names of its contributors
   may be used to endorse or promote products derived from this software
   without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   POSSIBILITY OF SUCH DAMAGE.
*/

/* This implementation is optimized for performance. For code size, a generic
   implementation of this function from newlib/libc/string/memcpy.c will be
   used. */
#include <picolibc.h>

#if !defined (__OPTIMIZE_SIZE__) && !defined (PREFER_SIZE_OVER_SPEED) \
    && !defined (__ARC_RF16__)

#include "asm.h"

#if defined (__ARCHS__)

#ifdef __LITTLE_ENDIAN__
# define SHIFT_1(RX,RY,IMM)     asl     RX, RY, IMM     ; <<
# define SHIFT_2(RX,RY,IMM)     lsr     RX, RY, IMM     ; >>
# define MERGE_1(RX,RY,IMM)     asl     RX, RY, IMM
# define MERGE_2(RX,RY,IMM)
# define EXTRACT_1(RX,RY,IMM)   and     RX, RY, 0xFFFF
# define EXTRACT_2(RX,RY,IMM)   lsr     RX, RY, IMM
#else
# define SHIFT_1(RX,RY,IMM)     lsr     RX, RY, IMM     ; >>
# define SHIFT_2(RX,RY,IMM)     asl     RX, RY, IMM     ; <<
# define MERGE_1(RX,RY,IMM)     asl     RX, RY, IMM     ; <<
# define MERGE_2(RX,RY,IMM)     asl     RX, RY, IMM     ; <<
# define EXTRACT_1(RX,RY,IMM)   lsr     RX, RY, IMM
# define EXTRACT_2(RX,RY,IMM)   lsr     RX, RY, 0x08
#endif

#ifdef __ARC_LL64__
# define PREFETCH_READ(RX)      prefetch  [RX, 56]
# define PREFETCH_WRITE(RX)     prefetchw [RX, 64]
# define LOADX(DST,RX)          ldd.ab  DST, [RX, 8]
# define STOREX(SRC,RX)         std.ab  SRC, [RX, 8]
# define ZOLSHFT                5
# define ZOLAND                 0x1F
#else
# define PREFETCH_READ(RX)      prefetch  [RX, 28]
# define PREFETCH_WRITE(RX)     prefetchw [RX, 32]
# define LOADX(DST,RX)          ld.ab   DST, [RX, 4]
# define STOREX(SRC,RX)         st.ab   SRC, [RX, 4]
# define ZOLSHFT                4
# define ZOLAND                 0xF
#endif


;;; MEMCPY copy memory regions
;;; Input arguments:
;;;   r0 - output memory region
;;;   r1 - input memory region
;;;   r2 - size in bytes
;;; Returns:
;;;   r0 - pointer to the first byte of the output region
;;; Clobber:
;;;   r1, r2, r3, r4, r5, r6, r7, r8r9, r10r11, lp_count

#if !defined (__ARC_UNALIGNED__)

;;; MEMCPY routine for the case when the CPU only accepts ALIGNED
;;; accesses to memory.
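;;; Strategy: first align the destination to a 32-bit boundary with a
;;; byte-copy loop, then dispatch on the remaining source misalignment
;;; (0, 1, 2 or 3 bytes).  The unaligned cases issue only aligned word
;;; loads and reassemble the bytes with the SHIFT_*, MERGE_* and
;;; EXTRACT_* macros above, so the CPU never sees an unaligned access.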
ENTRY (memcpy)
        prefetch  [r1]          ; Prefetch the read location
        prefetchw [r0]          ; Prefetch the write location
        mov.f   0, r2
; if size is zero
        jz.d    [blink]
        mov     r3, r0          ; don't clobber ret val

; if size <= 8
        cmp     r2, 8
        bls.d   .Lsmallchunk
        mov.f   lp_count, r2

; Align the destination to a 32-bit boundary
        and.f   r4, r0, 0x03
        rsub    lp_count, r4, 4
        lpnz    .Laligndestination
        ; LOOP START
        ldb.ab  r5, [r1,1]
        sub     r2, r2, 1
        stb.ab  r5, [r3,1]
.Laligndestination:

; Check the alignment of the source
        and.f   r4, r1, 0x03
        bnz.d   .Lsourceunaligned

; CASE 0: Both source and destination are 32bit aligned
; Convert len to Dwords, unfold x4
        lsr.f   lp_count, r2, ZOLSHFT
        lpnz    .Lcopy32_64bytes
        ; LOOP START
        LOADX (r6, r1)
        PREFETCH_READ (r1)
        PREFETCH_WRITE (r3)
        LOADX (r8, r1)
        LOADX (r10, r1)
        LOADX (r4, r1)
        STOREX (r6, r3)
        STOREX (r8, r3)
        STOREX (r10, r3)
        STOREX (r4, r3)
.Lcopy32_64bytes:

        and.f   lp_count, r2, ZOLAND    ;Last remaining bytes (len & ZOLAND)
.Lsmallchunk:
        lpnz    .Lcopyremainingbytes
        ; LOOP START
        ldb.ab  r5, [r1,1]
        stb.ab  r5, [r3,1]
.Lcopyremainingbytes:

        j       [blink]
; END CASE 0

.Lsourceunaligned:
        cmp     r4, 2
        beq.d   .LunalignedOffby2
        sub     r2, r2, 1

        bhi.d   .LunalignedOffby3
        ldb.ab  r5, [r1, 1]

; CASE 1: The source is unaligned, off by 1
        ; Read 1 byte to reach 16-bit alignment,
        ; then 2 more bytes to reach 32-bit alignment
        ldh.ab  r6, [r1, 2]
        sub     r2, r2, 2
        ; Convert to words, unfold x2
        lsr.f   lp_count, r2, 3
        MERGE_1 (r6, r6, 8)
        MERGE_2 (r5, r5, 24)
        or      r5, r5, r6

        ; Both src and dst are aligned
        lpnz    .Lcopy8bytes_1
        ; LOOP START
        ld.ab   r6, [r1, 4]
        prefetch [r1, 28]       ;Prefetch the next read location
        ld.ab   r8, [r1, 4]
        prefetchw [r3, 32]      ;Prefetch the next write location

        SHIFT_1 (r7, r6, 24)
        or      r7, r7, r5
        SHIFT_2 (r5, r6, 8)

        SHIFT_1 (r9, r8, 24)
        or      r9, r9, r5
        SHIFT_2 (r5, r8, 8)

        st.ab   r7, [r3, 4]
        st.ab   r9, [r3, 4]
.Lcopy8bytes_1:

        ; Write back the remaining 16bits
        EXTRACT_1 (r6, r5, 16)
        sth.ab  r6, [r3, 2]
        ; Write back the remaining 8bits
        EXTRACT_2 (r5, r5, 16)
        stb.ab  r5, [r3, 1]

        and.f   lp_count, r2, 0x07      ;Last 7 bytes
        lpnz    .Lcopybytewise_1
        ; LOOP START
        ldb.ab  r6, [r1,1]
        stb.ab  r6, [r3,1]
.Lcopybytewise_1:
        j       [blink]

.LunalignedOffby2:
; CASE 2: The source is unaligned, off by 2
        ldh.ab  r5, [r1, 2]
        sub     r2, r2, 1

        ; Both src and dst are aligned
        ; Convert to words, unfold x2
        lsr.f   lp_count, r2, 3
#ifdef __BIG_ENDIAN__
        asl.nz  r5, r5, 16
#endif
        lpnz    .Lcopy8bytes_2
        ; LOOP START
        ld.ab   r6, [r1, 4]
        prefetch [r1, 28]       ;Prefetch the next read location
        ld.ab   r8, [r1, 4]
        prefetchw [r3, 32]      ;Prefetch the next write location

        SHIFT_1 (r7, r6, 16)
        or      r7, r7, r5
        SHIFT_2 (r5, r6, 16)

        SHIFT_1 (r9, r8, 16)
        or      r9, r9, r5
        SHIFT_2 (r5, r8, 16)

        st.ab   r7, [r3, 4]
        st.ab   r9, [r3, 4]
.Lcopy8bytes_2:

#ifdef __BIG_ENDIAN__
        lsr.nz  r5, r5, 16
#endif
        sth.ab  r5, [r3, 2]

        and.f   lp_count, r2, 0x07      ;Last 7 bytes
        lpnz    .Lcopybytewise_2
        ; LOOP START
        ldb.ab  r6, [r1,1]
        stb.ab  r6, [r3,1]
.Lcopybytewise_2:
        j       [blink]

.LunalignedOffby3:
; CASE 3: The source is unaligned, off by 3
; Hence we need to read 1 byte to achieve 32-bit alignment
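; r5 already holds that byte: it was loaded in the delay slot of the
; bhi.d above, and r2 was decremented to match in the beq.d delay slot.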

        ; Both src and dst are aligned
        ; Convert to words, unfold x2
        lsr.f   lp_count, r2, 3
#ifdef __BIG_ENDIAN__
        asl.ne  r5, r5, 24
#endif
        lpnz    .Lcopy8bytes_3
        ; LOOP START
        ld.ab   r6, [r1, 4]
        prefetch [r1, 28]       ;Prefetch the next read location
        ld.ab   r8, [r1, 4]
        prefetchw [r3, 32]      ;Prefetch the next write location

        SHIFT_1 (r7, r6, 8)
        or      r7, r7, r5
        SHIFT_2 (r5, r6, 24)

        SHIFT_1 (r9, r8, 8)
        or      r9, r9, r5
        SHIFT_2 (r5, r8, 24)

        st.ab   r7, [r3, 4]
        st.ab   r9, [r3, 4]
.Lcopy8bytes_3:

#ifdef __BIG_ENDIAN__
        lsr.nz  r5, r5, 24
#endif
        stb.ab  r5, [r3, 1]

        and.f   lp_count, r2, 0x07      ;Last 7 bytes
        lpnz    .Lcopybytewise_3
        ; LOOP START
        ldb.ab  r6, [r1,1]
        stb.ab  r6, [r3,1]
.Lcopybytewise_3:
        j       [blink]

ENDFUNC (memcpy)

#else

;;; MEMCPY routine which is used by systems with unaligned memory
;;; accesses. This is the case for most of the ARCHS CPU family.
ENTRY (memcpy)
        prefetch  [r1]          ; Prefetch the read location
        prefetchw [r0]          ; Prefetch the write location
        mov.f   0, r2
;;; if size is zero
        jz.d    [blink]
        mov     r3, r0          ; don't clobber ret val

;;; if size <= 8
        cmp     r2, 8
        bls.d   .Lsmallchunk
        mov.f   lp_count, r2

;;; Convert len to Dwords, unfold x4
        lsr.f   lp_count, r2, ZOLSHFT
        lpnz    .Lcopyfast
        ;; LOOP START
        LOADX (r6, r1)
        PREFETCH_READ (r1)
        PREFETCH_WRITE (r3)
        LOADX (r8, r1)
        LOADX (r10, r1)
        LOADX (r4, r1)
        STOREX (r6, r3)
        STOREX (r8, r3)
        STOREX (r10, r3)
        STOREX (r4, r3)
.Lcopyfast:

#ifdef __ARC_LL64__
        and     r2, r2, ZOLAND          ;Remaining bytes (< 32)
        lsr.f   lp_count, r2, 3         ;Convert to 64-bit words.
        lpnz    .Lcopy64b
        ;; LOOP START
        ldd.ab  r6, [r1, 8]
        std.ab  r6, [r3, 8]
.Lcopy64b:

        and.f   lp_count, r2, 0x07      ; Last 7 bytes
#else
        and.f   lp_count, r2, ZOLAND
#endif

.Lsmallchunk:
        lpnz    .Lcopyremainingbytes
        ;; LOOP START
        ldb.ab  r5, [r1,1]
        stb.ab  r5, [r3,1]
.Lcopyremainingbytes:

        j       [blink]

ENDFUNC (memcpy)
#endif

#endif /* __ARCHS__ */

#endif /* !__OPTIMIZE_SIZE__ && !PREFER_SIZE_OVER_SPEED */