1/*
2   Copyright (c) 2015-2024, Synopsys, Inc. All rights reserved.
3
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions are met:
6
7   1) Redistributions of source code must retain the above copyright notice,
8   this list of conditions and the following disclaimer.
9
10   2) Redistributions in binary form must reproduce the above copyright notice,
11   this list of conditions and the following disclaimer in the documentation
12   and/or other materials provided with the distribution.
13
14   3) Neither the name of the Synopsys, Inc., nor the names of its contributors
15   may be used to endorse or promote products derived from this software
16   without specific prior written permission.
17
18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28   POSSIBILITY OF SUCH DAMAGE.
29*/
30
31/* This implementation is optimized for performance.  For code size a generic
32   implementation of this function from newlib/libc/string/memcpy.c will be
33   used.  */
34#include <picolibc.h>
35
36#if !defined (__OPTIMIZE_SIZE__) && !defined (PREFER_SIZE_OVER_SPEED) \
37    && !defined (__ARC_RF16__)
38
39#include "asm.h"
40
41#if defined (__ARCHS__)
42
/* Endian-dependent helpers used by the misaligned-source copy loops.
   SHIFT_1/SHIFT_2 split an aligned input word into the part that
   completes the current output word and the part carried over (in r5)
   to the next iteration; MERGE_1/MERGE_2 combine the initial 1-3
   alignment bytes into the carry register; EXTRACT_1/EXTRACT_2 peel
   the trailing halfword/byte back out of the carry register.  */
#ifdef __LITTLE_ENDIAN__
# define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM
# define MERGE_2(RX,RY,IMM)
# define EXTRACT_1(RX,RY,IMM)	and	RX, RY, 0xFFFF
# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, IMM
#else
# define SHIFT_1(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
# define SHIFT_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define MERGE_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define EXTRACT_1(RX,RY,IMM)	lsr	RX, RY, IMM
# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, 0x08
#endif

/* With the double load/store extension (__ARC_LL64__) each
   LOADX/STOREX moves 8 bytes and the x4-unrolled main loop moves 32
   bytes per iteration; without it, 4 and 16 bytes respectively.
   ZOLSHFT converts a byte count into main-loop iterations and ZOLAND
   masks out the leftover bytes for that unroll size.  The prefetch
   offsets point one main-loop iteration ahead.  */
#ifdef __ARC_LL64__
# define PREFETCH_READ(RX)	prefetch	[RX, 56]
# define PREFETCH_WRITE(RX)	prefetchw	[RX, 64]
# define LOADX(DST,RX)		ldd.ab	DST, [RX, 8]
# define STOREX(SRC,RX)		std.ab	SRC, [RX, 8]
# define ZOLSHFT		5
# define ZOLAND			0x1F
#else
# define PREFETCH_READ(RX)	prefetch	[RX, 28]
# define PREFETCH_WRITE(RX)	prefetchw	[RX, 32]
# define LOADX(DST,RX)		ld.ab	DST, [RX, 4]
# define STOREX(SRC,RX)		st.ab	SRC, [RX, 4]
# define ZOLSHFT		4
# define ZOLAND			0xF
#endif
74
75
;;; MEMCPY copy memory regions
;;; Input arguments:
;;;   r0 - output memory region
;;;   r1 - input memory region
;;;   r2 - size in bytes
;;; Returns:
;;;   r0 - pointer to the first byte of the output region
;;; Clobber:
;;;   r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lp_count
;;;   (r4-r5, r6-r7, r8-r9 and r10-r11 act as 64-bit register pairs
;;;   when __ARC_LL64__ double load/stores are used)
85
#if !defined (__ARC_UNALIGNED__)

;;; MEMCPY routine for the case when the CPU only accepts ALIGNED
;;; accesses to memory.
;;; Strategy: byte-copy until the destination is 32-bit aligned, then
;;; dispatch on the source's residual alignment (mod 4) to one of four
;;; word-copy paths (CASE 0..3).
ENTRY (memcpy)
	prefetch  [r1]		; Prefetch the read location
	prefetchw [r0]		; Prefetch the write location
	mov.f	0, r2		; set Z flag from the byte count
; if size is zero
	jz.d	[blink]		; return immediately when r2 == 0
	mov	r3, r0		; don't clobber ret val (delay slot: always runs)

; if size <= 8
	cmp	r2, 8
	bls.d	.Lsmallchunk	; tiny copies: plain byte loop
	mov.f	lp_count, r2	; delay slot: lp_count = n (flags for lpnz)

; Byte-copy the leading 1..3 bytes so the destination becomes 32-bit
; aligned (safe: n > 8 at this point).  lpnz tests the flags set by
; and.f, so the loop is skipped when r0 is already aligned.
	and.f	r4, r0, 0x03
	rsub	lp_count, r4, 4	; lp_count = 4 - (r0 & 3)
	lpnz	.Laligndestination
	; LOOP BEGIN
	ldb.ab	r5, [r1,1]
	sub	r2, r2, 1	; one less byte left to copy
	stb.ab	r5, [r3,1]
.Laligndestination:

; Check the alignment of the source
	and.f	r4, r1, 0x03
	bnz.d	.Lsourceunaligned
; NOTE: the lsr.f below is the delay slot of bnz.d, so it also executes
; on the unaligned path; harmless there, since lp_count is recomputed
; before any loop uses it.

; CASE 0: Both source and destination are 32bit aligned
; Convert len to Dwords, unfold x4
	lsr.f	lp_count, r2, ZOLSHFT	; iterations of the unrolled loop
	lpnz	.Lcopy32_64bytes
	; LOOP START
	LOADX (r6, r1)
	PREFETCH_READ (r1)
	PREFETCH_WRITE (r3)
	LOADX (r8, r1)
	LOADX (r10, r1)
	LOADX (r4, r1)
	STOREX (r6, r3)
	STOREX (r8, r3)
	STOREX (r10, r3)
	STOREX (r4, r3)
.Lcopy32_64bytes:

	and.f	lp_count, r2, ZOLAND ;Last remaining 31 bytes
.Lsmallchunk:
	lpnz	.Lcopyremainingbytes
	; LOOP START
	ldb.ab	r5, [r1,1]
	stb.ab	r5, [r3,1]
.Lcopyremainingbytes:

	j	[blink]
; END CASE 0

; Dispatch on source misalignment (r4 = r1 & 3, nonzero here).  The two
; delay-slot instructions below execute as follows:
;   r4 == 2 -> .LunalignedOffby2: sub only
;   r4 == 3 -> .LunalignedOffby3: sub + ldb (reads the one alignment byte)
;   r4 == 1 -> falls through:     sub + ldb
.Lsourceunaligned:
	cmp	r4, 2
	beq.d	.LunalignedOffby2
	sub	r2, r2, 1	; delay slot: account for one alignment byte

	bhi.d	.LunalignedOffby3	; unsigned 'higher' than 2 => r4 == 3
	ldb.ab	r5, [r1, 1]	; delay slot: first alignment byte into r5

; CASE 1: The source is unaligned, off by 1
	; Hence I need to read 1 byte for a 16bit alignment
	; and 2bytes to reach 32bit alignment
	ldh.ab	r6, [r1, 2]
	sub	r2, r2, 2	; 3 alignment bytes consumed in total
	; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
	; Merge the 3 bytes read so far into the carry register r5.
	MERGE_1 (r6, r6, 8)
	MERGE_2 (r5, r5, 24)
	or	r5, r5, r6

	; Both src and dst are aligned
	; Each iteration loads two aligned words and re-splices them with
	; the bytes carried over in r5 from the previous iteration.
	lpnz	.Lcopy8bytes_1
	; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	;Prefetch the next read location
	ld.ab	r8, [r1,4]
	prefetchw [r3, 32]	;Prefetch the next write location

	SHIFT_1	(r7, r6, 24)
	or	r7, r7, r5	; output word = carried bytes | shifted input
	SHIFT_2	(r5, r6, 8)	; new carry

	SHIFT_1	(r9, r8, 24)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 8)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_1:

	; Write back the remaining 16bits
	EXTRACT_1 (r6, r5, 16)
	sth.ab	r6, [r3, 2]
	; Write back the remaining 8bits
	EXTRACT_2 (r5, r5, 16)
	stb.ab	r5, [r3, 1]

	and.f	lp_count, r2, 0x07 ;Last 8bytes
	lpnz	.Lcopybytewise_1
	; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_1:
	j	[blink]

.LunalignedOffby2:
; CASE 2: The source is unaligned, off by 2
	ldh.ab	r5, [r1, 2]	; 2 alignment bytes; r2 was decremented once
	sub	r2, r2, 1	; in the beq.d delay slot, once more here

	; Both src and dst are aligned
	; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	; Pre-position the carried halfword; .nz = only when the word
	; loop below will run (flags are still those of the lsr.f above).
	asl.nz	r5, r5, 16
#endif
	lpnz	.Lcopy8bytes_2
	; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	;Prefetch the next read location
	ld.ab	r8, [r1,4]
	prefetchw [r3, 32]	;Prefetch the next write location

	SHIFT_1	(r7, r6, 16)
	or	r7, r7, r5	; output word = carried halfword | shifted input
	SHIFT_2	(r5, r6, 16)	; new carry

	SHIFT_1	(r9, r8, 16)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 16)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_2:

#ifdef __BIG_ENDIAN__
	; Undo the pre-shift under the same condition it was applied
	; (no flag-setting instruction has run since that lsr.f).
	lsr.nz	r5, r5, 16
#endif
	sth.ab	r5, [r3, 2]	; trailing halfword from the carry register

	and.f	lp_count, r2, 0x07 ;Last 8bytes
	lpnz	.Lcopybytewise_2
	; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_2:
	j	[blink]

.LunalignedOffby3:
; CASE 3: The source is unaligned, off by 3
; Hence, I need to read 1byte for achieve the 32bit alignment
; (already done: the ldb in the bhi.d delay slot loaded it into r5,
; and r2 has already been decremented once).

	; Both src and dst are aligned
	; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	; Pre-position the carried byte (.ne is the same condition as
	; .nz: only when the word loop will run; flags from lsr.f above).
	asl.ne	r5, r5, 24
#endif
	lpnz	.Lcopy8bytes_3
	; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	;Prefetch the next read location
	ld.ab	r8, [r1,4]
	prefetchw [r3, 32]	;Prefetch the next write location

	SHIFT_1	(r7, r6, 8)
	or	r7, r7, r5	; output word = carried byte | shifted input
	SHIFT_2	(r5, r6, 24)	; new carry

	SHIFT_1	(r9, r8, 8)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 24)

	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_3:

#ifdef __BIG_ENDIAN__
	; Undo the pre-shift under the same condition it was applied.
	lsr.nz	r5, r5, 24
#endif
	stb.ab	r5, [r3, 1]	; trailing byte from the carry register

	and.f	lp_count, r2, 0x07 ;Last 8bytes
	lpnz	.Lcopybytewise_3
	; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_3:
	j	[blink]

ENDFUNC (memcpy)
284
285#else
286
;;; MEMCPY routine which is used by systems with unaligned memory
;;; accesses.  This is the case for most of ARCHS CPU family.
;;; Since loads/stores may be misaligned, no source/destination
;;; alignment fix-up is needed: a single x4-unrolled main loop moves
;;; the bulk, followed by (optionally) a 64-bit loop and a byte loop
;;; for the tail.
ENTRY(memcpy)
	prefetch  [r1]		; Prefetch the read location
	prefetchw [r0]		; Prefetch the write location
	mov.f	0, r2		; set Z flag from the byte count
;;; if size is zero
	jz.d	[blink]		; return immediately when r2 == 0
	mov	r3, r0		; don't clobber ret val (delay slot: always runs)

;;; if size <= 8
	cmp	r2, 8
	bls.d	.Lsmallchunk	; tiny copies: plain byte loop
	mov.f	lp_count, r2	; delay slot: lp_count = n (flags for lpnz)

;;; Convert len to Dwords, unfold x4
	lsr.f	lp_count, r2, ZOLSHFT	; iterations of the unrolled loop
	lpnz	.Lcopyfast
	;; LOOP START
	LOADX (r6, r1)
	PREFETCH_READ (r1)
	PREFETCH_WRITE (r3)
	LOADX (r8, r1)
	LOADX (r10, r1)
	LOADX (r4, r1)
	STOREX (r6, r3)
	STOREX (r8, r3)
	STOREX (r10, r3)
	STOREX (r4, r3)
.Lcopyfast:

#ifdef __ARC_LL64__
	and     r2, r2, ZOLAND	;Remaining 31 bytes
	lsr.f   lp_count, r2, 3	;Convert to 64-bit words.
	lpnz	.Lcopy64b
	;; LOOP START
	ldd.ab	r6,[r1,8]	; double load/store: 8 bytes per iteration
	std.ab	r6,[r3,8]
.Lcopy64b:

	and.f	lp_count, r2, 0x07 ; Last 7 bytes
#else
	and.f	lp_count, r2, ZOLAND	; leftover bytes after the x4 loop
#endif

.Lsmallchunk:
	lpnz	.Lcopyremainingbytes
	;; LOOP START
	ldb.ab	r5, [r1,1]
	stb.ab	r5, [r3,1]
.Lcopyremainingbytes:

	j	[blink]

ENDFUNC(memcpy)
342#endif
343
344#endif /* __ARCHS__ */
345
346#endif /* !__OPTIMIZE_SIZE__ && !PREFER_SIZE_OVER_SPEED */
347