/* ANSI C standard library function memcpy.

   Copyright (c) 2002-2008 Tensilica Inc.

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
   CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */

#include "xtensa-asm.h"

/* If the Xtensa Unaligned Load Exception option is not used, this
   code can run a few cycles faster by relying on the low address bits
   being ignored.  However, if the code is then run with an Xtensa ISS
   client that checks for unaligned accesses, it will produce a lot of
   warning messages.  Set this flag to disable the use of unaligned
   accesses and keep the ISS happy.  (Note that the "|| 1" below
   unconditionally enables the checked behavior in this copy of the
   file.)  */

#if XCHAL_UNALIGNED_LOAD_EXCEPTION || 1
#define UNALIGNED_ADDRESSES_CHECKED 1
#endif
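
/* When PSRAM_FIX is defined, MEMW barriers (and a few NOPs in the
   byte-copy loop) are inserted after the stores below; judging by the
   name, this is presumably a workaround for copies involving external
   PSRAM.  */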


/* void *memcpy (void *dst, const void *src, size_t len)

   The algorithm is as follows:

   If the destination is unaligned, align it by conditionally
   copying 1- and/or 2-byte pieces.

   If the source is aligned, copy 16 bytes at a time with a loop, and
   then finish up with 8-, 4-, 2-, and 1-byte copies conditional on
   the length.

   Else (if the source is unaligned), do the same, but use SRC to align
   the source data.

   This code tries to use fall-through branches for the common case of an
   aligned source and destination and a length that is a multiple of 4
   (or 8).  */

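/* For reference only, a rough C sketch of the flow described above.  It is
   not part of the build; "memcpy_sketch" is a hypothetical name, and the
   unaligned-source case is reduced to plain byte copies here instead of the
   SSA8/SRC shifts used by the assembly below.

     #include <stddef.h>
     #include <stdint.h>

     void *memcpy_sketch (void *dst, const void *src, size_t n)
     {
       unsigned char *d = dst;
       const unsigned char *s = src;

       // Align the destination to a word boundary.
       while (n > 0 && ((uintptr_t) d & 3) != 0)
         { *d++ = *s++; n--; }

       if (((uintptr_t) s & 3) == 0)
         {
           // Aligned source: copy 16 bytes per iteration with word
           // accesses, like the main loop below.
           while (n >= 16)
             {
               ((uint32_t *) d)[0] = ((const uint32_t *) s)[0];
               ((uint32_t *) d)[1] = ((const uint32_t *) s)[1];
               ((uint32_t *) d)[2] = ((const uint32_t *) s)[2];
               ((uint32_t *) d)[3] = ((const uint32_t *) s)[3];
               d += 16; s += 16; n -= 16;
             }
         }

       // 8/4/2/1-byte tails (and the whole unaligned-source case in
       // this sketch).
       while (n > 0)
         { *d++ = *s++; n--; }

       return dst;
     }
*/
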

/* Byte by byte copy.  */

	.text
	.begin schedule
	.align	XCHAL_INST_FETCH_WIDTH
	.literal_position
__memcpy_aux:

	/* Skip bytes so that the three-byte LOOPNEZ (or BEQZ) at .Lbytecopy
	   ends on a fetch-width boundary and the loop body that follows
	   starts fetch-aligned.  */
.skip XCHAL_INST_FETCH_WIDTH - 3

.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, 2f
#else
	beqz	a4, 2f
	add	a7, a3, a4	// a7 = end address for source
#endif
1:	l8ui	a6, a3, 0
	addi	a3, a3, 1
#ifdef PSRAM_FIX
	nop
	nop
	nop
#endif
	s8i	a6, a5, 0
	addi	a5, a5, 1
#ifdef PSRAM_FIX
	memw
#endif
#if !XCHAL_HAVE_LOOPS
	bltu	a3, a7, 1b
#endif
2:	leaf_return


/* Destination is unaligned.  */

	.align	4
.Ldst1mod2: // dst is only byte aligned

	/* Do short copies byte-by-byte.  */
	bltui	a4, 7, .Lbytecopy

	/* Copy 1 byte.  */
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	addi	a4, a4, -1
	s8i	a6, a5, 0
#ifdef PSRAM_FIX
	memw
#endif
	addi	a5, a5, 1

	/* Return to main algorithm if dst is now aligned.  */
	bbci.l	a5, 1, .Ldstaligned

.Ldst2mod4: // dst has 16-bit alignment

	/* Do short copies byte-by-byte.  */
	bltui	a4, 6, .Lbytecopy

	/* Copy 2 bytes.  */
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	addi	a4, a4, -2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
#ifdef PSRAM_FIX
	memw
#endif
	addi	a5, a5, 2

	/* dst is now aligned; return to main algorithm.  */
	j	.Ldstaligned


	.align	4
	.global	memcpy
	.type	memcpy, @function
memcpy:
	leaf_entry sp, 16
	/* a2 = dst, a3 = src, a4 = len */

	mov	a5, a2		// copy dst so that a2 is return value
	bbsi.l	a2, 0, .Ldst1mod2
	bbsi.l	a2, 1, .Ldst2mod4
.Ldstaligned:

	/* Get number of loop iterations with 16B per iteration.  */
	srli	a7, a4, 4

	/* Check if source is aligned.  */
	slli	a8, a3, 30	// shift the low two bits of src up to the MSBs
	bnez	a8, .Lsrcunaligned

	/* Destination and source are word-aligned, use word copy.  */
#if XCHAL_HAVE_LOOPS
	loopnez	a7, 2f
#else
	beqz	a7, 2f
	slli	a8, a7, 4
	add	a8, a8, a3	// a8 = end of last 16B source chunk
#endif

#ifndef PSRAM_FIX

1:	l32i	a6, a3, 0	// (hypothesis: a MEMW after this load might also serve as a fix)
	l32i	a7, a3, 4
	s32i	a6, a5, 0
	l32i	a6, a3, 8
	s32i	a7, a5, 4
	l32i	a7, a3, 12
	s32i	a6, a5, 8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16

#else
1:	l32i	a6, a3, 0
	l32i	a7, a3, 4
	s32i	a6, a5, 0
	s32i	a7, a5, 4
	memw
	l32i	a6, a3, 8
	l32i	a7, a3, 12
	s32i	a6, a5, 8
	s32i	a7, a5, 12
	memw

	addi	a3, a3, 16
	addi	a5, a5, 16

#endif


#if !XCHAL_HAVE_LOOPS
	bltu	a3, a8, 1b
#endif

	/* Copy any leftover pieces smaller than 16B.  */
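	/* Bits 3, 2, 1 and 0 of the remaining length in a4 select the 8-,
	   4-, 2- and 1-byte tail copies below via BBCI.L/BBSI.L.  */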
2:	bbci.l	a4, 3, 3f

	/* Copy 8 bytes.  */
	l32i	a6, a3, 0
	l32i	a7, a3, 4
	addi	a3, a3, 8
	s32i	a6, a5, 0
	s32i	a7, a5, 4
	addi	a5, a5, 8

3:	bbsi.l	a4, 2, 4f
	bbsi.l	a4, 1, 5f
	bbsi.l	a4, 0, 6f
#ifdef PSRAM_FIX
	memw
#endif
	leaf_return

	.align 4
	/* Copy 4 bytes.  */
4:	l32i	a6, a3, 0
	addi	a3, a3, 4
	s32i	a6, a5, 0
	addi	a5, a5, 4
	bbsi.l	a4, 1, 5f
	bbsi.l	a4, 0, 6f
#ifdef PSRAM_FIX
	memw
#endif
	leaf_return

	/* Copy 2 bytes.  */
5:	l16ui	a6, a3, 0
	addi	a3, a3, 2
	s16i	a6, a5, 0
	addi	a5, a5, 2
	bbsi.l	a4, 0, 6f
#ifdef PSRAM_FIX
	memw
#endif
	leaf_return

	/* Copy 1 byte.  */
6:	l8ui	a6, a3, 0
	s8i	a6, a5, 0

.Ldone:
#ifdef PSRAM_FIX
	memw
#endif
	leaf_return


/* Destination is aligned; source is unaligned.  */

	.align	4
.Lsrcunaligned:
	/* Avoid loading anything for zero-length copies.  */
	beqz	a4, .Ldone

	/* Copy 16 bytes per iteration for word-aligned dst and
	   unaligned src.  */
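	/* Conceptually, with off = src & 3 (1..3 on this path) and the
	   source pointer rounded down to a word boundary, each destination
	   word is built from two consecutive source words; on a
	   little-endian core, roughly:
	       dst_word[i] = (src_word[i] >> (8*off)) | (src_word[i+1] << (32 - 8*off))
	   SSA8 sets the shift amount (SAR) from the byte offset, and SRC
	   performs the combined shift of a register pair in one
	   instruction.  */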
	ssa8	a3		// set shift amount from byte offset
#if UNALIGNED_ADDRESSES_CHECKED
	srli	a11, a8, 30	// save unalignment offset for below
	sub	a3, a3, a11	// align a3
#endif
	l32i	a6, a3, 0	// load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, 2f
#else
	beqz	a7, 2f
	slli	a10, a7, 4
	add	a10, a10, a3	// a10 = end of last 16B source chunk
#endif
1:	l32i	a7, a3, 4
	l32i	a8, a3, 8
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	l32i	a9, a3, 12
	src_b	a7, a7, a8
	s32i	a7, a5, 4
	l32i	a6, a3, 16
	src_b	a8, a8, a9
	s32i	a8, a5, 8
	addi	a3, a3, 16
	src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bltu	a3, a10, 1b
#endif

2:	bbci.l	a4, 3, 3f

	/* Copy 8 bytes.  */
	l32i	a7, a3, 4
	l32i	a8, a3, 8
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	addi	a3, a3, 8
	src_b	a7, a7, a8
	s32i	a7, a5, 4
	addi	a5, a5, 8
	mov	a6, a8		// a6 = last word loaded (now at offset 0), needed by the SRC below

3:	bbci.l	a4, 2, 4f

	/* Copy 4 bytes.  */
	l32i	a7, a3, 4
	addi	a3, a3, 4
	src_b	a6, a6, a7
	s32i	a6, a5, 0
	addi	a5, a5, 4
	mov	a6, a7
4:
#if UNALIGNED_ADDRESSES_CHECKED
	add	a3, a3, a11	// readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, 5f
	bbsi.l	a4, 0, 6f
	leaf_return

	/* Copy 2 bytes.  */
5:	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a5, a5, 2
	bbsi.l	a4, 0, 6f
#ifdef PSRAM_FIX
	memw
#endif
	leaf_return

	/* Copy 1 byte.  */
6:	l8ui	a6, a3, 0
	s8i	a6, a5, 0
#ifdef PSRAM_FIX
	memw
#endif
	leaf_return

	.end schedule

	.size	memcpy, . - memcpy
