/* ANSI C standard library function memcpy.

   Copyright (c) 2002-2008 Tensilica Inc.

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
   CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */

#include <picolibc.h>

#include "xtensa-asm.h"
/* If the Xtensa Unaligned Load Exception option is not used, this
   code can run a few cycles faster by relying on the low address bits
   being ignored.  However, if the code is then run with an Xtensa ISS
   client that checks for unaligned accesses, it will produce a lot of
   warning messages.  Set this flag to disable the use of unaligned
   accesses and keep the ISS happy.  */

/* #define UNALIGNED_ADDRESSES_CHECKED XCHAL_UNALIGNED_LOAD_EXCEPTION */
#define UNALIGNED_ADDRESSES_CHECKED 1
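
/* In other words: with UNALIGNED_ADDRESSES_CHECKED set, the
   unaligned-source path below computes an explicitly word-aligned
   source pointer, roughly
       src_aligned = (const uint32_t *) ((uintptr_t) src & ~3);
   and re-adds the offset before the final byte copies, instead of
   relying on l32i to ignore the low address bits.  */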


/* void *memcpy (void *dst, const void *src, size_t len)

   The algorithm is as follows:

   If the destination is unaligned, align it by conditionally
   copying 1- and/or 2-byte pieces.

   If the source is aligned, copy 16 bytes per iteration with a loop, and
   then finish up with 8-, 4-, 2-, and 1-byte copies conditional on the
   length.

   Else (if the source is unaligned), do the same, but use the SRC
   (funnel shift) instruction to align the source data.

   This code tries to use fall-through branches for the common case of an
   aligned source and destination and a length that is a multiple of
   4 (or 8).  */
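
/* For orientation only: a rough C-level sketch of the same structure.
   It omits the short-copy cutoffs and the funnel-shift handling of
   unaligned sources, and the function name is purely illustrative.

     #include <stddef.h>
     #include <stdint.h>

     void *memcpy_sketch (void *dst, const void *src, size_t len)
     {
       unsigned char *d = dst;
       const unsigned char *s = src;

       // Align the destination with 1- and/or 2-byte copies.
       if (((uintptr_t) d & 1) && len >= 1)
         { *d++ = *s++; len--; }
       if (((uintptr_t) d & 2) && len >= 2)
         { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; len -= 2; }

       // Main loop: 16 bytes per iteration.
       for (size_t i = len >> 4; i > 0; i--, d += 16, s += 16)
         for (int j = 0; j < 16; j++)
           d[j] = s[j];

       // Tail: 8-, 4-, 2-, and 1-byte pieces keyed off the length bits.
       if (len & 8) { for (int j = 0; j < 8; j++) d[j] = s[j]; d += 8; s += 8; }
       if (len & 4) { for (int j = 0; j < 4; j++) d[j] = s[j]; d += 4; s += 4; }
       if (len & 2) { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; }
       if (len & 1) { d[0] = s[0]; }
       return dst;
     }  */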


/* Byte-by-byte copy.  */

	.text
	.begin schedule
	.align	XCHAL_INST_FETCH_WIDTH
	.literal_position
__memcpy_aux:

	/* Skip bytes to get proper alignment for the three-byte loop instruction.  */
.skip XCHAL_INST_FETCH_WIDTH - 3

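/* Register convention for the byte-copy loop (all paths that reach it):
   a3 = current src, a4 = remaining length, a5 = current dst; a2 still
   holds the original dst and is the return value.  */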
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, 2f
#else
	beqz	a4, 2f
	add	a7, a3, a4	// a7 = end address for source
#endif
1:	l8ui	a6, a3, 0
	addi	a3, a3, 1
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	nop
	nop
	nop
#endif
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
#if !XCHAL_HAVE_LOOPS
	bltu	a3, a7, 1b
#endif
2:	leaf_return


/* Destination is unaligned.  */

	.align	4
.Ldst1mod2: // dst is only byte-aligned

	/* Do short copies byte-by-byte.  */
	bltui	a4, 7, .Lbytecopy

	/* Copy 1 byte.  */
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	addi	a4, a4, -1
	s8i	a6, a5, 0
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	addi	a5, a5, 1

	/* Return to the main algorithm if dst is now aligned.  */
	bbci.l	a5, 1, .Ldstaligned

.Ldst2mod4: // dst has 16-bit alignment

	/* Do short copies byte-by-byte.  */
	bltui	a4, 6, .Lbytecopy

	/* Copy 2 bytes.  */
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	addi	a4, a4, -2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	addi	a5, a5, 2

	/* dst is now aligned; return to the main algorithm.  */
	j	.Ldstaligned


	.align	4
	.global	memcpy
	.type	memcpy, @function
memcpy:
	leaf_entry sp, 16
	/* a2 = dst, a3 = src, a4 = len */

	mov	a5, a2		// copy dst so that a2 is return value
	bbsi.l	a2, 0, .Ldst1mod2
	bbsi.l	a2, 1, .Ldst2mod4
.Ldstaligned:

	/* Get the number of loop iterations with 16B per iteration.  */
	srli	a7, a4, 4

	/* Check if the source is aligned.  */
	slli	a8, a3, 30	// shift the low 2 bits of src to the top
	bnez	a8, .Lsrcunaligned

	/* Destination and source are word-aligned; use word copies.  */
#if XCHAL_HAVE_LOOPS
	loopnez	a7, 2f
#else
	beqz	a7, 2f
	slli	a8, a7, 4
	add	a8, a8, a3	// a8 = end of last 16B source chunk
#endif

#if XTENSA_ESP32_PSRAM_CACHE_FIX

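	/* The memw barriers (part of the ESP32 PSRAM cache workaround)
	   force each pair of stores to complete before the following
	   loads are issued.  */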
1:	l32i	a6, a3, 0
	l32i	a7, a3, 4
	s32i	a6, a5, 0
	s32i	a7, a5, 4
	memw
	l32i	a6, a3, 8
	l32i	a7, a3, 12
	s32i	a6, a5, 8
	s32i	a7, a5, 12
	memw

	addi	a3, a3, 16
	addi	a5, a5, 16

#else
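
	/* Loads and stores are interleaved so that no load result is
	   consumed by the instruction immediately after it.  */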

1:	l32i	a6, a3, 0
	l32i	a7, a3, 4
	s32i	a6, a5, 0
	l32i	a6, a3, 8
	s32i	a7, a5, 4
	l32i	a7, a3, 12
	s32i	a6, a5, 8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16

#endif


#if !XCHAL_HAVE_LOOPS
	bltu	a3, a8, 1b
#endif

	/* Copy any leftover pieces smaller than 16B.  */
2:	bbci.l	a4, 3, 3f

	/* Copy 8 bytes.  */
	l32i	a6, a3, 0
	l32i	a7, a3, 4
	addi	a3, a3, 8
	s32i	a6, a5, 0
	s32i	a7, a5, 4
	addi	a5, a5, 8

3:	bbsi.l	a4, 2, 4f
	bbsi.l	a4, 1, 5f
	bbsi.l	a4, 0, 6f
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	leaf_return

	.align 4
	/* Copy 4 bytes.  */
4:	l32i	a6, a3, 0
	addi	a3, a3, 4
	s32i	a6, a5, 0
	addi	a5, a5, 4
	bbsi.l	a4, 1, 5f
	bbsi.l	a4, 0, 6f
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	leaf_return

	/* Copy 2 bytes.  */
5:	l16ui	a6, a3, 0
	addi	a3, a3, 2
	s16i	a6, a5, 0
	addi	a5, a5, 2
	bbsi.l	a4, 0, 6f
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	leaf_return

	/* Copy 1 byte.  */
6:	l8ui	a6, a3, 0
	s8i	a6, a5, 0

.Ldone:
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	leaf_return


/* Destination is aligned; source is unaligned.  */

	.align	4
.Lsrcunaligned:
	/* Avoid loading anything for zero-length copies.  */
	beqz	a4, .Ldone

	/* Copy 16 bytes per iteration for word-aligned dst and
	   unaligned src.  */
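	/* ssa8 sets SAR from the low bits of the source address; each
	   src_b below (a macro from xtensa-asm.h around the SRC
	   funnel-shift instruction) then merges two adjacent aligned
	   source words into one correctly shifted destination word.  */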
	ssa8	a3		// set shift amount from byte offset
#if UNALIGNED_ADDRESSES_CHECKED
	srli	a11, a8, 30	// save unalignment offset for below
	sub	a3, a3, a11	// align a3
#endif
	l32i	a6, a3, 0	// load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, 2f
#else
	beqz	a7, 2f
	slli	a10, a7, 4
	add	a10, a10, a3	// a10 = end of last 16B source chunk
#endif
1:	l32i	a7, a3, 4
	l32i	a8, a3, 8
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	l32i	a9, a3, 12
	src_b	a7, a7, a8
	s32i	a7, a5, 4
	l32i	a6, a3, 16
	src_b	a8, a8, a9
	s32i	a8, a5, 8
	addi	a3, a3, 16
	src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bltu	a3, a10, 1b
#endif

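	/* Tail of the unaligned-source path: a6 always holds the most
	   recently loaded (aligned) source word, so each remaining src_b
	   can merge it with the next word.  */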
2:	bbci.l	a4, 3, 3f

	/* Copy 8 bytes.  */
	l32i	a7, a3, 4
	l32i	a8, a3, 8
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	addi	a3, a3, 8
	src_b	a7, a7, a8
	s32i	a7, a5, 4
	addi	a5, a5, 8
	mov	a6, a8		// a6 = last word loaded, for the next src_b

3:	bbci.l	a4, 2, 4f

	/* Copy 4 bytes.  */
	l32i	a7, a3, 4
	addi	a3, a3, 4
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	addi	a5, a5, 4
	mov	a6, a7
4:
#if UNALIGNED_ADDRESSES_CHECKED
	add	a3, a3, a11	// readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, 5f
	bbsi.l	a4, 0, 6f
	leaf_return

	/* Copy 2 bytes (the source may be odd-aligned, so use byte accesses).  */
5:	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a5, a5, 2
	bbsi.l	a4, 0, 6f
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	leaf_return

	/* Copy 1 byte.  */
6:	l8ui	a6, a3, 0
	s8i	a6, a5, 0
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	leaf_return

	.end schedule

	.size	memcpy, . - memcpy