/*
   Copyright (c) 2024, Synopsys, Inc. All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:

   1) Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.

   2) Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

   3) Neither the name of the Synopsys, Inc., nor the names of its contributors
   may be used to endorse or promote products derived from this software
   without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   POSSIBILITY OF SUCH DAMAGE.
*/

#include <picolibc.h>

#include <sys/asm.h>

; r0 void* dest
; r1 const void* src
; r2 size_t count

; Two variants follow: a 32-bit crunching implementation for targets without
; 64-bit loads/stores, and the 64-bit (or 128-bit) crunching implementation
; otherwise.

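; For orientation, a minimal C sketch of the strategy used below (hypothetical
; reference code, not part of the build): copy forwards when the destination
; lies below the source, otherwise copy backwards so overlapping bytes are
; read before they are overwritten. The real code additionally copies in
; 16- or 32-byte chunks and handles the remainder with a branch table.
;
;	#include <stddef.h>
;
;	void *memmove_sketch(void *dest, const void *src, size_t count)
;	{
;		char *d = dest;
;		const char *s = src;
;		if (d < s) {			/* forward copy is safe */
;			while (count--)
;				*d++ = *s++;
;		} else {			/* copy from the end backwards */
;			d += count;
;			s += count;
;			while (count--)
;				*--d = *--s;
;		}
;		return dest;
;	}
;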
#if defined (__ARC64_ARCH32__) && !defined(__ARC64_LL64__)

ENTRY (memmove)

; If the destination lies below the source,
	cmp	r0, r1
	ADDP	r4, r1, r2
; or if the source plus count is smaller than the destination (no overlap),
	cmp.eq r4, r0

; a normal forward memcpy is safe. Otherwise, we need to perform the copy
; backwards
	blo.d	@.L_normal_memcpy
	lsr.f	r11, r2, 4		; counter for 16-byte chunks

	ADDP	r3, r0, r2

; Backwards copy
; The only thing that changes between memcpy and memmove is the copy direction
; in case the dest and src memory regions overlap.
; More detailed information is in the forwards copy and at the end of
; this file.

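; For example (hypothetical addresses): with src = 0x1000, dest = 0x1004 and
; count = 8 the regions overlap; a forward copy would overwrite source bytes
; at 0x1004..0x1007 before reading them, while the backward copy below reads
; every source byte before it can be clobbered.
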
	ADDP	r1, r1, r2
	bmsk_s	r2, r2, 3

	bbit0.d	r2, 1, @1f
	lsr	r5, r2, 2
	ldh.aw	r4, [r1, -2]
	sth.aw	r4, [r3, -2]
1:
	bbit0.d	r2, 0, @1f
	xor	r5, r5, 3
	ldb.aw	r4, [r1, -1]
	stb.aw	r4, [r3, -1]
1:
	asl	r5, r5, 1
	bi	[r5]
	ld.aw	r4,[r1, -4]
	st.aw	r4,[r3, -4]
	ld.aw	r4,[r1, -4]
	st.aw	r4,[r3, -4]
	ld.aw	r4,[r1, -4]
	st.aw	r4,[r3, -4]

; Return if there are no 16 byte chunks
	jeq	[blink]

.L_write_backwards_16_bytes:
	ld.aw	r4, [r1, -4]
	ld.aw	r5, [r1, -4]
	ld.aw	r6, [r1, -4]
	ld.aw	r7, [r1, -4]
	st.aw	r4, [r3, -4]
	st.aw	r5, [r3, -4]
	st.aw	r6, [r3, -4]
	dbnz.d	r11, @.L_write_backwards_16_bytes
	st.aw	r7, [r3, -4]

	j_s	[blink]

.L_normal_memcpy:
	beq.d	@.L_write_forwards_15_bytes
	mov	r3, r0			; work on a copy of "r0"

.L_write_forwards_16_bytes:
	ld.ab	r4, [r1, 4]
	ld.ab	r5, [r1, 4]
	ld.ab	r6, [r1, 4]
	ld.ab	r7, [r1, 4]
	st.ab	r4, [r3, 4]
	st.ab	r5, [r3, 4]
	st.ab	r6, [r3, 4]
	dbnz.d	r11, @.L_write_forwards_16_bytes
	st.ab	r7, [r3, 4]
	bmsk_s	r2, r2, 3

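; Worked example (hypothetical count): for count = 23 the loop above copies
; one 16-byte chunk and r2 = 23 & 15 = 7 (0111b) remains; below, b1 copies
; 2 bytes, b0 copies 1 byte, and r11 = ((7 >> 2) ^ 3) << 1 = 4 makes "bi"
; skip two of the three word copies, so one more 4-byte word is copied.
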
.L_write_forwards_15_bytes:
	bbit0.d	r2, 1, @1f
	lsr	r11, r2, 2
	ldh.ab	r4, [r1, 2]
	sth.ab	r4, [r3, 2]
1:
	bbit0.d	r2, 0, @1f
	xor	r11, r11, 3
	ldb.ab	r4, [r1, 1]
	stb.ab	r4, [r3, 1]
1:
	asl	r11, r11, 1
	bi	[r11]
	ld.ab	r4,[r1, 4]
	st.ab	r4,[r3, 4]
	ld.ab	r4,[r1, 4]
	st.ab	r4,[r3, 4]
	ld	r4,[r1]
	st	r4,[r3]

	j_s	[blink]

ENDFUNC (memmove)

#else

ENTRY (memmove)
; If the destination lies below the source,
	cmp	r0, r1
	ADDP	r4, r1, r2
; or if the source plus count is smaller than the destination (no overlap),
	cmp.eq r4, r0

; a normal forward memcpy is safe. Otherwise, we need to perform the copy
; backwards
	blo.d	@.L_normal_memcpy
	LSRP.f	r12, r2, 5		; counter for 32-byte chunks

	ADDP	r3, r0, r2

; Backwards copy
; The only thing that changes between memcpy and memmove is the copy direction
; in case the dest and src memory regions overlap.
; More detailed information is in the forwards copy and at the end of
; this file.

; Set r1 to point past the end of the source; r3 (the working copy of the
; destination) was set past the end of dest above
	ADDP	r1, r1, r2
	bmsk_s	r2, r2, 4

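; Worked example (hypothetical count): for count = 45, r12 = 45 >> 5 = 1 and
; r2 = 45 & 31 = 13 (01101b); below, b0 copies 1 byte, b2 copies 4 bytes, and
; r11 = ((13 >> 3) ^ 3) << 1 = 4 makes "bi" skip two of the three LD64/ST64
; pairs (copying 8 more bytes), before the loop handles the remaining 32 bytes.
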
	bbit0.d	r2, 0, @1f
	lsr	r11, r2, 3
	ldb.aw	r4, [r1, -1]
	stb.aw	r4, [r3, -1]
1:
	bbit0.d	r2, 1, @1f
	xor	r11, r11, 3
	ldh.aw	r4, [r1, -2]
	sth.aw	r4, [r3, -2]
1:
	bbit0.d	r2, 2, @1f
	asl	r11, r11, 1
	ld.aw	r4, [r1, -4]
	st.aw	r4, [r3, -4]
1:
	bi	[r11]
	LD64.aw	r4, [r1, -8]
	ST64.aw	r4, [r3, -8]
	LD64.aw	r4, [r1, -8]
	ST64.aw	r4, [r3, -8]
	LD64.aw	r4, [r1, -8]
	ST64.aw	r4, [r3, -8]

; Return if there are no 32 byte chunks
	jeq	[blink]

.L_write_backwards_32_bytes:			; Take care of 32 byte chunks
#if defined (__ARC64_M128__)

	lddl.aw	r4r5, [r1, -16]
	lddl.aw	r6r7, [r1, -16]

	stdl.aw	r4r5, [r3, -16]
	stdl.aw	r6r7, [r3, -16]
	dbnz	r12, @.L_write_backwards_32_bytes

#elif defined (__ARC64_ARCH64__) || (  defined (__ARC64_ARCH32__) && defined (__ARC64_LL64__) )

	LD64.aw	r4, [r1, -8]
	LD64.aw	r6, [r1, -8]
	LD64.aw	r8, [r1, -8]
	LD64.aw	r10,[r1, -8]

	ST64.aw	r4, [r3, -8]
	ST64.aw	r6, [r3, -8]
	ST64.aw	r8, [r3, -8]
	dbnz.d	r12, @.L_write_backwards_32_bytes
	ST64.aw	r10, [r3, -8]

#else
# error Unknown configuration
#endif

	j_s	[blink]

; Normal memcpy
.L_normal_memcpy:
	;LSRP.f	r12, r2, 5		; Moved up

	beq.d	@.L_write_forwards_31_bytes
	MOVP	r3, r0			; do not clobber the "dest"

.L_write_forwards_32_bytes:			; Take care of 32 byte chunks
#if defined (__ARC64_M128__)

	lddl.ab	r4r5, [r1, +16]
	lddl.ab	r6r7, [r1, +16]

	stdl.ab	r4r5, [r3, +16]
	stdl.ab	r6r7, [r3, +16]
	dbnz	r12, @.L_write_forwards_32_bytes

#elif defined (__ARC64_ARCH64__) || (  defined (__ARC64_ARCH32__) && defined (__ARC64_LL64__) )

	LD64.ab	r4, [r1, +8]
	LD64.ab	r6, [r1, +8]
	LD64.ab	r8, [r1, +8]
	LD64.ab	r10,[r1, +8]
	ST64.ab	r4, [r3, +8]
	ST64.ab	r6, [r3, +8]
	ST64.ab	r8, [r3, +8]
	dbnz.d	r12, @.L_write_forwards_32_bytes
	ST64.ab	r10, [r3, +8]	; Shove store in delay slot

#else
# error Unknown configuration
#endif

	bmsk_s	r2, r2, 4		; From now on, we only care for the remainder % 32

; The remainder bits indicate how many more bytes to copy
; .------------------------.
; | b4 | b3 | b2 | b1 | b0 |
; `------------------------'
;   16    8    4    2    1
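; For example (hypothetical count): with count = 100 the 32-byte loop above
; runs three times (96 bytes) and r2 = 100 & 31 = 4, so only b2 is set: one
; more 4-byte word is copied and "bi" skips all three LD64/ST64 pairs below.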
.L_write_forwards_31_bytes:
	bbit0.d	r2, 2, @1f		; is b2 set? then copy 4 bytes
	lsr	    r12, r2, 3		; see the notes below
	ld.ab	r4, [r1, 4]
	st.ab	r4, [r3, 4]
1:
	bbit0.d	r2, 1, @1f		; is b1 set? then copy 2 bytes
	xor	    r12, r12, 3
	ldh.ab	r4, [r1, 2]
	sth.ab	r4, [r3, 2]
1:
	bbit0.d	r2, 0, @1f		; is b0 set? then copy 1 byte
	asl	    r12, r12, 1
	ldb.ab	r4, [r1, 1]
	stb.ab	r4, [r3, 1]

; Interpreting bits (b4,b3) [1] and how they correlate to the branch index:
;
; (b4,b3) | bytes to copy | branch index
; --------+---------------+-------------
;   00b   |       0       |   3 (11b)
;   01b   |       8       |   2 (10b)
;   10b   |      16       |   1 (01b)
;   11b   |      24       |   0 (00b)
;
; To go from (b4,b3) to the branch index, the bits must be flipped.
; In other words, they must be XORed with 11b [2].
;
; Last but not least, "bi" jumps at boundaries of 4. We need to double
; the index to jump 8 bytes [3].
;
; Hence the 3 operations, spread across the "bbit0" delay slots, that
; calculate the branch index:
;
;	lsr	    r12, r2,  3    [1]
;	xor	    r12, r12, 3    [2]
;	asl	    r12, r12, 1    [3]
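;
; For instance (hypothetical remainder), r2 = 16 gives (b4,b3) = 10b, i.e.
; 16 bytes left to copy: the index becomes (10b ^ 11b) << 1 = 2, so "bi"
; skips the first LD64/ST64 pair and the remaining two pairs copy 16 bytes.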
1:
	bi	    [r12]
	LD64.ab	r4, [r1, 8]
	ST64.ab	r4, [r3, 8]
	LD64.ab	r4, [r1, 8]
	ST64.ab	r4, [r3, 8]
	LD64.ab	r4, [r1, 8]
	ST64.ab	r4, [r3, 8]

	j_s	[blink]

ENDFUNC (memmove)

#endif
