1/*
2 *  Core HAL library functions xthal_memcpy and xthal_bcopy
3 */
4
5/*
6 * Copyright (c) 2003, 2006, 2010 Tensilica Inc.
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining
9 * a copy of this software and associated documentation files (the
10 * "Software"), to deal in the Software without restriction, including
11 * without limitation the rights to use, copy, modify, merge, publish,
12 * distribute, sublicense, and/or sell copies of the Software, and to
13 * permit persons to whom the Software is furnished to do so, subject to
14 * the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be included
17 * in all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
23 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 */
27
28#include <xtensa/coreasm.h>
29
30
/*
 * Endianness helpers.
 *
 * BL(b,l) expands to its first argument when __XTENSA_EB__ is defined
 * and to its second argument otherwise.  The macros below use it so the
 * copy code can be written once in terms of "early" (lower-addressed)
 * and "late" (higher-addressed) bytes.
 */
#if defined(__XTENSA_EB__)
# define BL(b,l)	b
#else
# define BL(b,l)	l
#endif

	//  srcel r, early, late
	//	Combine the early and late words with SRC and place the
	//	shifted result in \r.  Uses the shift amount currently in SAR.
	.macro	srcel	r, early, late
	src	\r, BL(\early,\late), BL(\late,\early)
	.endm

	//  ssa8f r
	//	Set the shift amount for a shift *from* the 2-bit byte
	//	alignment held in \r.
	.macro	ssa8f	r
	BL(ssa8b,ssa8l)	\r
	.endm

	//  ssa8t r
	//	Set the shift amount for a shift *to* the 2-bit byte
	//	alignment held in \r (the reverse selection of ssa8f).
	.macro	ssa8t	r
	BL(ssa8l,ssa8b)	\r
	.endm

	//  s2ll r, s
	//	Shift-to-later logical: move the bytes of \s away from the
	//	zero-addressed byte, result in \r.
	.macro	s2ll	r, s
	BL(srl,sll)	\r, \s
	.endm

	//  s2el r, s
	//	Shift-to-early logical: move the bytes of \s towards the
	//	zero-addressed byte, result in \r.
	.macro	s2el	r, s
	BL(sll,srl)	\r, \s
	.endm
56
57/*
58 * void *xthal_memcpy(void *dst, const void *src, size_t len);
59 * void *xthal_bcopy(const void *src, void *dst, size_t len);
60 *
61 * This function is intended to do the same thing as the standard
62 * library function memcpy() (or bcopy()) for most cases.
63 * However, it uses strictly 32-bit load and store instructions
64 * to copy data.  This ensures this function will work
65 * where the source and/or destination references an
66 * instruction RAM or ROM, which can only be accessed
67 * using l32i (IRAM+IROM) and s32i (IRAM).
68 *
69 * The bcopy version is provided here to avoid the overhead
70 * of an extra call, for callers that require this convention.
71 *
72 * The (general case) algorithm is as follows:
73 *   If destination is unaligned, align it by copying 1 to 3 bytes.
74 *   If source is aligned,
75 *     do 16 bytes with a loop, and then finish up with
76 *     8, 4, and 0-3 byte copies conditional on the length;
77 *   else (if source is unaligned),
78 *     do the same, but use SRC to align the source data.
79 *   This code tries to use fall-through branches for the common
80 *     case of aligned source and destination and multiple
81 *     of 4 length.
82 *
83 * Register use:
84 *	a0/ return address
85 *	a1/ stack pointer
86 *	a2/ return value
87 *	a3/ src
88 *	a4/ length
89 *	a5/ dst
90 *	a6/ tmp
91 *	a7/ tmp
92 *	a8/ tmp
93 *	a9/ tmp
94 *	a10/ tmp
95 *	a11/ tmp
96 *	a12/ tmp
97 */
98
99/* xthal_bcopy and xthal_memcpy need to allocate the same stack size
100 * on entry since they share the same function-return code.  Also,
101 * there is more than one return point. */
102
103#define SAVE_A0  0
104#define SAVE_A3  4
105#define SAVE_A4  8
106#define SAVE_A5  12
107#define SAVE_A12 16
108#define STKSIZE  32
109
110
111	.text
112	.align	4
113	.global	xthal_bcopy
114	.type	xthal_bcopy,@function
xthal_bcopy:
	//  void *xthal_bcopy(const void *src, void *dst, size_t len)
	//
	//  Entry point with bcopy() argument order.  Sets up the same frame
	//  as xthal_memcpy (both share its return code), rearranges the
	//  arguments into memcpy order and joins the common copy code.
#ifdef __XTENSA_CALL0_ABI__
	addi	a1, a1, -STKSIZE	// same frame size as xthal_memcpy
	s32i	a12, a1, SAVE_A12	// a12 is used as a temporary by the copy code
#else
	entry	a1, STKSIZE		// frame large enough for the call8 made later
#endif
	// on entry: a2 = src, a3 = dst, a4 = len
	mov	a5, a3			// a5 = dst (working destination pointer)
	mov	a3, a2			// a3 = src
	mov	a2, a5			// a2 = dst, which is also the return value
	j	.Lcommon		// continue in the code shared with xthal_memcpy

	.size	xthal_bcopy, . - xthal_bcopy
129
130
131
132/*
133 * Destination is unaligned
134 */
135
136	.align	4
xthal_memcpy.prefixcode:	// purely for purpose of .size
.Ldstunaligned:
	//  Reached from xthal_memcpy/xthal_bcopy when dst is not word aligned.
	//  On entry:  a2 = a5 = dst, a3 = src, a4 = len (>= 4),
	//             a6 = dst & 3 (non-zero).
	//  Copies the 1..3 bytes needed to align dst with xthal_copy123, then
	//  advances a5/a3, reduces a4 and resumes at .Ldstaligned.
	movi	a12, 4
	sub	a6, a12, a6	// a6 = number of bytes to copy for dst alignment
#ifdef __XTENSA_CALL0_ABI__
	s32i	a0, a1, SAVE_A0	// preserve live registers
	s32i	a3, a1, SAVE_A3
	s32i	a4, a1, SAVE_A4
	s32i	a5, a1, SAVE_A5
	mov	a12, a6		// keep the byte count across the call (a12 is not touched by xthal_copy123)
	// call0 passes arguments in a2/a3/a4.  a2 (dst) and a3 (src) already
	// hold the right values; a4 must be the alignment byte count, not the
	// full length.  The full length is reloaded from SAVE_A4 below.
	mov	a4, a6
	call0	xthal_copy123
	l32i	a0, a1, SAVE_A0	// restore live registers
	l32i	a3, a1, SAVE_A3
	l32i	a4, a1, SAVE_A4
	l32i	a5, a1, SAVE_A5
	mov	a6, a12		// restore a6 from callee-saved register
#else
	// call8 makes a10/a11/a12 the callee's a2/a3/a4
	mov	a10, a5		// arg1 = dst
	mov	a11, a3		// arg2 = src
	mov	a12, a6		// arg3 = number of alignment bytes
	call8	xthal_copy123
#endif
	add	a5, a5, a6	// dst is now word aligned
	add	a3, a3, a6	// advance src by the same amount
	sub	a4, a4, a6	// remaining length
	j	.Ldstaligned

	//  Not sure how else to count code that precedes a function, in .size:
	.size	xthal_memcpy.prefixcode, . - xthal_memcpy.prefixcode
165
166
167	.align	4
168	.global	xthal_memcpy
169	.type	xthal_memcpy,@function
xthal_memcpy:
	//  void *xthal_memcpy(void *dst, const void *src, size_t len)
	//
	//  Copies len bytes from src to dst using only 32-bit loads and
	//  stores, and returns dst in a2.  Lengths below 4 are handed to
	//  xthal_copy123.  Otherwise dst is aligned first (.Ldstunaligned),
	//  then the copy proceeds 16 bytes per iteration, followed by
	//  optional 8-byte, 4-byte and 1..3-byte steps.  An unaligned source
	//  is handled by .Lsrcunaligned.
#ifdef __XTENSA_CALL0_ABI__
	addi    sp, sp, -STKSIZE
	s32i    a12, a1, SAVE_A12
#else
	entry	sp, 32		// allow for call8 below
#endif
	// a2=dst, a3=src, a4=len
	mov	a5, a2			// copy dst so that a2 is return value
.Lcommon:
	// Shared by xthal_memcpy and xthal_bcopy:
	//   a2 = dst (return value), a3 = src, a4 = len, a5 = dst (working copy)
#ifdef __XTENSA_CALL0_ABI__
	/*
	 * Short copy (len < 4): call xthal_copy123 with a2/a3/a4 as they are,
	 * then undo the frame allocated on entry before returning.
	 */
	_bgeui	a4, 4, 1f
	mov	a12, a0		// preserve return address
	call0	xthal_copy123
	mov	a0, a12		// restore return address
	l32i    a12, a1, SAVE_A12
	addi    sp, sp, STKSIZE
	ret
1:
#else
	// Short copy (len < 4): continue inside xthal_copy123, whose return
	// then returns from this function's frame.
	bltui	a4, 4, xthal_copy123_pastentry	// NOTE: sometimes relaxes
#endif

	extui	a6, a2, 0, 2		// destination unalignment offset
	bnez	a6, .Ldstunaligned	// align the destination
.Ldstaligned:				// return here once dst is aligned
	srli	a7, a4, 4		// number of loop iterations of 16-bytes each
	extui	a11, a3, 0, 2		// source unalignment offset
	_bnez	a11, .Lsrcunaligned	// if source not aligned, use shifting copy
	/*
	 * Destination and source are 32-bit aligned, use 32-bit copy.
	 */
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3		// a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	// Loads and stores are interleaved, alternating between a6 and a7.
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	s32i	a6, a5,  0
	l32i	a6, a3,  8
	s32i	a7, a5,  4
	l32i	a7, a3, 12
	s32i	a6, a5,  8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	// a3 advances in 16-byte steps and a8 = start + 16*n, so a3 reaches
	// a8 exactly.  Comparing for inequality (instead of a signed
	// less-than) stays correct when the source range crosses 0x80000000
	// or ends at the top of the address space.
	bne	a3, a8, .Loop1
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	bbci.l	a4, 3, .L2		// skip if bit 3 of the length is clear
	// copy 8 bytes
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a3, a3,  8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
	addi	a5, a5,  8
.L2:
	bbci.l	a4, 2, .L3		// skip if bit 2 of the length is clear
	// copy 4 bytes
	l32i	a6, a3,  0
	addi	a3, a3,  4
	s32i	a6, a5,  0
	addi	a5, a5,  4
.L3:
	//  Copy last 0 to 3 bytes using 32-bit accesses (aligned source and destination):
	extui	a4, a4, 0, 2	// any bytes to copy?
	beqz	a4, 1f		// if not, skip this to avoid extraneous loads/stores
	l32i	a6, a3, 0	// get source word
	l32i	a7, a5, 0	// get destination word
	ssa8f	a4		// shift from length (end of source)
	s2ll	a6, a6		// align source to last byte
	s2el	a7, a7		// align parts of a7 following modified bytes, to early byte
	ssa8t	a4		// shift to end of modified destination (length)
	srcel	a7, a6, a7	// combine source with late-dst to form last word
	s32i	a7, a5, 0	// update last word
1:

#ifdef __XTENSA_CALL0_ABI__
	l32i    a12, a1, SAVE_A12	// restore the register saved on entry
	addi    sp, sp, STKSIZE		// release the frame
	ret
#else
	retw
#endif

	.size	xthal_memcpy, . - xthal_memcpy
265
266
267	//  void xthal_copy123(dst, src, len);
268	//
269	//  Copy from 0 to 3 bytes, using only 32-bit loads and stores,
270	//  with arbitrarily aligned source and destination.
271	//
272	// arg1 = a2 = dst
273	// arg2 = a3 = src
274	// arg3 = a4 = len
275
276	.global	xthal_copy123
277	.type	xthal_copy123,@function
278	.align	4
xthal_copy123:
	abi_entry

	//  xthal_memcpy/xthal_bcopy branch here directly (windowed ABI) for
	//  copies shorter than 4 bytes, with a2/a3/a4 already set up.
xthal_copy123_pastentry:
	_beqz	a4, cdone	// zero length: touch no memory at all

	//  Step 1: fetch the source bytes.
	//  Both words around the source are read, then funnel-shifted so the
	//  first source byte lands in the earliest byte position of a3.
	ssa8f	a3		// shift amount from the source byte offset
	extui	a5, a3, 0, 2	// a5 = src & 3
	sub	a5, a3, a5	// a5 = src rounded down to a word boundary
	l32i	a6, a5, 0	// word holding the first source byte
	l32i	a7, a5, 4	// following word
	srcel	a3, a6, a7
	// a3 = source bytes in memory order; the source address is dead now

	//  Step 2: work out how the destination range sits in memory.
	extui	a10, a2, 0, 2	// a10 = dst & 3
	add	a6, a10, a4	// a6 = dst_align + len
	sub	a5, a2, a10	// a5 = dst rounded down to a word boundary
	l32i	a8, a5, 0	// first destination word is always needed
	ssa8f	a2		// shift from dst_align (to 1st or last byte)
	s2ll	a10, a8		// a10 = leading destination bytes, moved to the late end
	bltui	a6, 4, oneword	// range lies within one word: skip ahead

	//  Step 3 (only when dst_align + len >= 4): finish the first word.
	l32i	a8, a5, 4	// a8 = second destination word
	ssa8t	a2		// shift to dst_align
	srcel	a10, a10, a3	// kept leading bytes first, source bytes after
	s32i	a10, a5, 0	// write back the first word
	addi	a5, a5, 4	// a5 -> last word, as the shared code expects
	//movi	a10, 0		// not needed, gets dropped

oneword:
	//  Step 4: build and store the last (or only) destination word.
	//	a8  = current contents of that word
	//	a10 = destination bytes ahead of the modified range,
	//	      shifted towards the last byte
	ssa8f	a4		// shift from length (end of source)
	srcel	a3, a10, a3	// leading-dst + source, aligned to the last byte

	ssa8f	a6		// shift from the end of the modified range
	s2el	a8, a8		// trailing destination bytes, moved to the early end
	ssa8t	a6		// shift to the end of the modified range
	srcel	a8, a3, a8	// leading-dst + source + trailing-dst
	s32i	a8, a5, 0	// write back the last word
cdone:
	abi_return		// a2 still holds dst
325
326/*
327 * Destination is aligned, Source is unaligned
328 */
329
330	.align	4
.Lsrcunaligned:
	//  Reached from .Ldstaligned when dst is word aligned but src is not.
	//  On entry:  a3 = src, a4 = remaining length, a5 = dst (aligned),
	//             a7 = number of 16-byte iterations (a4 >> 4).
	//  Every source word is loaded with an aligned l32i and adjacent
	//  words are combined with SRC using the shift amount set here.
	//  a6 always carries the most recently loaded, not yet consumed word.
	// Copy 16 bytes per iteration for word-aligned dst and unaligned src
	ssa8f	a3		// set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS (simulator) with the
					   lint or ferret client, or 0 to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	extui	a11, a3, 0, 2	// save unalignment offset for below
	sub	a3, a3, a11	// align a3
#endif
	l32i	a6, a3, 0	// load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	// a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	srcel	a6, a6, a7
	s32i	a6, a5,  0
	l32i	a9, a3, 12
	srcel	a7, a7, a8
	s32i	a7, a5,  4
	l32i	a6, a3, 16	// word carried into the next step
	srcel	a8, a8, a9
	s32i	a8, a5,  8
	addi	a3, a3, 16
	srcel	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	// a3 advances in 16-byte steps and a10 = start + 16*n, so a3 reaches
	// a10 exactly.  Comparing for inequality (instead of a signed
	// less-than) stays correct when the source range crosses 0x80000000
	// or ends at the top of the address space.
	bne	a3, a10, .Loop2
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12	// skip if bit 3 of the length is clear
	// copy 8 bytes
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	srcel	a6, a6, a7
	s32i	a6, a5,  0
	addi	a3, a3,  8
	srcel	a7, a7, a8
	s32i	a7, a5,  4
	addi	a5, a5,  8
	mov	a6, a8		// last loaded word becomes the carried word
.L12:
	bbci.l	a4, 2, .L13	// skip if bit 2 of the length is clear
	// copy 4 bytes
	l32i	a7, a3,  4
	addi	a3, a3,  4
	srcel	a6, a6, a7
	s32i	a6, a5,  0
	addi	a5, a5,  4
	mov	a6, a7		// last loaded word becomes the carried word
.L13:
	//  Copy last 0 to 3 bytes using 32-bit accesses (shifting source, aligned destination):
	//  With no bytes left, skip the tail entirely so that no word past the
	//  end of the source is loaded and the destination word after the
	//  copied range is not read and rewritten (same policy as the aligned
	//  path).  a7 is free here; it is reloaded below before its next use.
	extui	a7, a4, 0, 2	// any bytes to copy?
	beqz	a7, .Ldone	// if not, avoid extraneous loads/stores
	l32i	a7, a3, 4	// get source word
	l32i	a3, a5, 0	// get destination word
	srcel	a6, a6, a7	// source bytes, aligned to early (1st) byte
	ssa8f	a4		// shift from length (end of source)
	s2ll	a6, a6		// align source to last byte
	s2el	a3, a3		// align parts of a3 following modified bytes, to early byte
	ssa8t	a4		// shift to end of modified destination (length)
	srcel	a3, a6, a3	// combine source with late-dst to form last word
	s32i	a3, a5, 0	// update last word
.Ldone:
#ifdef __XTENSA_CALL0_ABI__
	l32i    a12, a1, SAVE_A12	// restore the register saved on entry
	addi    sp, sp, STKSIZE		// release the frame
	ret
#else
	retw
#endif

	.size	xthal_copy123, . - xthal_copy123
409
410