/* ANSI C standard library function memcpy.

   Copyright (c) 2002-2008 Tensilica Inc.

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
   CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */

#include "xtensa-asm.h"

/* If the Xtensa Unaligned Load Exception option is not used, this
   code can run a few cycles faster by relying on the low address bits
   being ignored.  However, if the code is then run with an Xtensa ISS
   client that checks for unaligned accesses, it will produce a lot of
   warning messages.  Set this flag to disable the use of unaligned
   accesses and keep the ISS happy.  */

/* #define UNALIGNED_ADDRESSES_CHECKED XCHAL_UNALIGNED_LOAD_EXCEPTION */
#define UNALIGNED_ADDRESSES_CHECKED 1


/* void *memcpy (void *dst, const void *src, size_t len)

   The algorithm is as follows:

   If the destination is unaligned, align it by conditionally
   copying 1- and/or 2-byte pieces.

   If the source is aligned, copy 16 bytes per iteration with a loop, and
   then finish up with 8-, 4-, 2-, and 1-byte copies conditional on the
   length.

   Else (if the source is unaligned), do the same, but use SRC to align
   the source data.

   This code tries to use fall-through branches for the common case of an
   aligned source and destination and a length that is a multiple of
   4 (or 8).  */
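
/* For reference, a rough C sketch of the strategy described above.  It is
   illustrative only (not part of the build), ignores strict-aliasing
   niceties for the word copies, and simplifies one step: where the assembly
   handles an unaligned source with the SSA8/SRC funnel shift, the sketch
   falls back to a plain byte loop.

   #include <stddef.h>
   #include <stdint.h>

   void *memcpy_sketch (void *dst, const void *src, size_t len)
   {
     unsigned char *d = dst;
     const unsigned char *s = src;

     // Align the destination with 1- and/or 2-byte copies; copies shorter
     // than 7 (or 6) bytes are not worth aligning and go byte-by-byte.
     if (len >= 7 && ((uintptr_t) d & 1)) { *d++ = *s++; len -= 1; }
     if (len >= 6 && ((uintptr_t) d & 2)) { *d++ = *s++; *d++ = *s++; len -= 2; }

     if ((((uintptr_t) d | (uintptr_t) s) & 3) == 0)
       {
         uint32_t *dw = (uint32_t *) (void *) d;
         const uint32_t *sw = (const uint32_t *) (const void *) s;

         // Main loop: 16 bytes (four words) per iteration.
         for (size_t i = len >> 4; i > 0; i--, dw += 4, sw += 4)
           { dw[0] = sw[0]; dw[1] = sw[1]; dw[2] = sw[2]; dw[3] = sw[3]; }

         // Tail: 8-, 4-, 2-, and 1-byte pieces keyed off the length bits.
         if (len & 8) { dw[0] = sw[0]; dw[1] = sw[1]; dw += 2; sw += 2; }
         if (len & 4) { *dw++ = *sw++; }
         d = (unsigned char *) dw;
         s = (const unsigned char *) sw;
         if (len & 2) { *d++ = *s++; *d++ = *s++; }
         if (len & 1) { *d = *s; }
       }
     else
       {
         // Unaligned source or short copy: byte-by-byte.
         while (len-- > 0) *d++ = *s++;
       }

     return dst;
   }  */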


/* Byte by byte copy.  */

	.text
	.begin schedule
	.align	XCHAL_INST_FETCH_WIDTH
	.literal_position
__memcpy_aux:

	/* Skip bytes to get proper alignment for three-byte loop */
.skip XCHAL_INST_FETCH_WIDTH - 3

.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, 2f
#else
	beqz	a4, 2f
	add	a7, a3, a4	// a7 = end address for source
#endif
1:	l8ui	a6, a3, 0
	addi	a3, a3, 1
#if XTENSA_ESP32_PSRAM_CACHE_FIX
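	/* When XTENSA_ESP32_PSRAM_CACHE_FIX is defined, extra nops and memw
	   barriers (here and throughout this file) are emitted as a
	   workaround for the ESP32 PSRAM cache issue, trading some speed for
	   correct behavior when copies touch external PSRAM.  */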
	nop
	nop
	nop
#endif
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
#if !XCHAL_HAVE_LOOPS
	bltu	a3, a7, 1b
#endif
2:	leaf_return


/* Destination is unaligned.  */

	.align	4
.Ldst1mod2: // dst is only byte aligned

	/* Do short copies byte-by-byte.  */
	bltui	a4, 7, .Lbytecopy

	/* Copy 1 byte.  */
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	addi	a4, a4, -1
	s8i	a6, a5, 0
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	addi	a5, a5, 1

	/* Return to main algorithm if dst is now aligned.  */
	bbci.l	a5, 1, .Ldstaligned

.Ldst2mod4: // dst has 16-bit alignment

	/* Do short copies byte-by-byte.  */
	bltui	a4, 6, .Lbytecopy

	/* Copy 2 bytes.  */
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	addi	a4, a4, -2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	addi	a5, a5, 2

	/* dst is now aligned; return to main algorithm.  */
	j	.Ldstaligned


	.align	4
	.global	memcpy
	.type	memcpy, @function
memcpy:
	leaf_entry sp, 16
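	/* leaf_entry and leaf_return are macros provided by xtensa-asm.h;
	   they expand to the prologue/return sequence appropriate for the
	   configured ABI (windowed register entry vs. call0).  */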
	/* a2 = dst, a3 = src, a4 = len */

	mov	a5, a2		// copy dst so that a2 is return value
	bbsi.l	a2, 0, .Ldst1mod2
	bbsi.l	a2, 1, .Ldst2mod4
.Ldstaligned:

	/* Get number of loop iterations with 16B per iteration.  */
	srli	a7, a4, 4

	/* Check if the source is word-aligned: shift the two low address
	   bits to the top of a8; a nonzero result means src is unaligned.  */
	slli	a8, a3, 30
	bnez	a8, .Lsrcunaligned

	/* Destination and source are word-aligned, use word copy.  */
#if XCHAL_HAVE_LOOPS
	loopnez	a7, 2f
#else
	beqz	a7, 2f
	slli	a8, a7, 4
	add	a8, a8, a3	// a8 = end of last 16B source chunk
#endif

#if XTENSA_ESP32_PSRAM_CACHE_FIX

1:	l32i	a6, a3, 0
	l32i	a7, a3, 4
	s32i	a6, a5, 0
	s32i	a7, a5, 4
	memw
	l32i	a6, a3, 8
	l32i	a7, a3, 12
	s32i	a6, a5, 8
	s32i	a7, a5, 12
	memw

	addi	a3, a3, 16
	addi	a5, a5, 16

#else

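	/* Loads and stores are interleaved so that each value is stored a
	   couple of instructions after it is loaded, helping hide load-use
	   latency.  */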
1:	l32i	a6, a3, 0
	l32i	a7, a3, 4
	s32i	a6, a5, 0
	l32i	a6, a3, 8
	s32i	a7, a5, 4
	l32i	a7, a3, 12
	s32i	a6, a5, 8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16

#endif


#if !XCHAL_HAVE_LOOPS
	bltu	a3, a8, 1b
#endif

	/* Copy any leftover pieces smaller than 16B.  */
2:	bbci.l	a4, 3, 3f

	/* Copy 8 bytes.  */
	l32i	a6, a3, 0
	l32i	a7, a3, 4
	addi	a3, a3, 8
	s32i	a6, a5, 0
	s32i	a7, a5, 4
	addi	a5, a5, 8

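	/* Dispatch on the low bits of the remaining length: bit 2 selects a
	   4-byte copy, bit 1 a 2-byte copy, and bit 0 a final 1-byte copy.  */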
3:	bbsi.l	a4, 2, 4f
	bbsi.l	a4, 1, 5f
	bbsi.l	a4, 0, 6f
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	leaf_return

	.align 4
	/* Copy 4 bytes.  */
4:	l32i	a6, a3, 0
	addi	a3, a3, 4
	s32i	a6, a5, 0
	addi	a5, a5, 4
	bbsi.l	a4, 1, 5f
	bbsi.l	a4, 0, 6f
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	leaf_return

	/* Copy 2 bytes.  */
5:	l16ui	a6, a3, 0
	addi	a3, a3, 2
	s16i	a6, a5, 0
	addi	a5, a5, 2
	bbsi.l	a4, 0, 6f
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	leaf_return

	/* Copy 1 byte.  */
6:	l8ui	a6, a3, 0
	s8i	a6, a5, 0

.Ldone:
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	leaf_return


/* Destination is aligned; source is unaligned.  */

	.align	4
.Lsrcunaligned:
	/* Avoid loading anything for zero-length copies.  */
	beqz	a4, .Ldone

	/* Copy 16 bytes per iteration for word-aligned dst and
	   unaligned src.  */
	ssa8	a3		// set shift amount from byte offset
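	/* ssa8 loads the shift-amount register (SAR) from the low two bits of
	   the source address; each src_b below then extracts one destination
	   word from the concatenation of two adjacent source words, shifted
	   by SAR.  */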
#if UNALIGNED_ADDRESSES_CHECKED
	srli	a11, a8, 30	// save unalignment offset for below
	sub	a3, a3, a11	// align a3
#endif
	l32i	a6, a3, 0	// load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, 2f
#else
	beqz	a7, 2f
	slli	a10, a7, 4
	add	a10, a10, a3	// a10 = end of last 16B source chunk
#endif
1:	l32i	a7, a3, 4
	l32i	a8, a3, 8
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	l32i	a9, a3, 12
	src_b	a7, a7, a8
	s32i	a7, a5, 4
	l32i	a6, a3, 16
	src_b	a8, a8, a9
	s32i	a8, a5, 8
	addi	a3, a3, 16
	src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bltu	a3, a10, 1b
#endif

2:	bbci.l	a4, 3, 3f

	/* Copy 8 bytes.  */
	l32i	a7, a3, 4
	l32i	a8, a3, 8
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	addi	a3, a3, 8
	src_b	a7, a7, a8
	s32i	a7, a5, 4
	addi	a5, a5, 8
	mov	a6, a8

3:	bbci.l	a4, 2, 4f

	/* Copy 4 bytes.  */
	l32i	a7, a3, 4
	addi	a3, a3, 4
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	addi	a5, a5, 4
	mov	a6, a7
4:
#if UNALIGNED_ADDRESSES_CHECKED
	add	a3, a3, a11	// readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, 5f
	bbsi.l	a4, 0, 6f
	leaf_return

	/* Copy 2 bytes.  */
5:	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a5, a5, 2
	bbsi.l	a4, 0, 6f
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	leaf_return

	/* Copy 1 byte.  */
6:	l8ui	a6, a3, 0
	s8i	a6, a5, 0
#if XTENSA_ESP32_PSRAM_CACHE_FIX
	memw
#endif
	leaf_return

	.end schedule

	.size	memcpy, . - memcpy