1#include <picolibc.h>
2
3!
4! Fast SH memcpy
5!
6! by Toshiyasu Morita (tm@netcom.com)
7! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
8/* SH5 code Copyright 2002 SuperH Ltd. */
9!
10! Entry: ARG0: destination pointer
11!        ARG1: source pointer
12!        ARG2: byte count
13!
14! Exit:  RESULT: destination pointer
15!        any other registers in the range r0-r7: trashed
16!
17! Notes: Usually one wants to do small reads and write a longword, but
18!        unfortunately it is difficult in some cases to concatenate bytes
19!        into a longword on the SH, so this does a longword read and small
20!        writes.
21!
22! This implementation makes two assumptions about how it is called:
23!
24! 1.: If the byte count is nonzero, the address of the last byte to be
25!     copied is unsigned greater than the address of the first byte to
26!     be copied.  This could be easily swapped for a signed comparison,
27!     but the algorithm used needs some comparison.
28!
29! 2.: When there are two or three bytes in the last word of an 11-or-more
30!     bytes memory chunk to be copied, the rest of the word can be read
31!     without side effects.
32!     This could be easily changed by increasing the minimum size of
33!     a fast memcpy and the amount subtracted from r7 before L_2l_loop by 2,
34!     however, this would cost a few extra cycles on average.
35!     For SHmedia, the assumption is that any quadword can be read in its
36!     entirety if at least one byte is included in the copy.
37!
38
39#include "asm.h"
40
/*
 * memcpy entry point.  ENTRY is a macro from asm.h (not shown here).
 *
 * SHmedia variant (__SHMEDIA__).  Register usage as seen in the code below:
 *   r2 = destination pointer, r3 = source pointer, r4 = byte count.
 *   r2 is never written on this path, so it still holds the destination
 *   pointer when the routine branches out.
 *   r5 = destination end (r2 + r4), r6 = source end (r3 + r4).
 *   tr1 is loaded from r18 and used for the final branch out of the
 *   routine; r18 presumably holds the return address - TODO confirm
 *   against the SHmedia ABI.
 *   r63 is only ever used as a destination whose value is not read again.
 */
41ENTRY(memcpy)
42
43#if __SHMEDIA__
44
/*
 * Unaligned access helpers.  Each expands to a lo-part access at offset O
 * and a hi-part access at the last byte of the item (O+7 for a quadword,
 * O+3 for a longword).  After LDUAQ/LDUAL the two destination registers
 * are combined with "or" by the code below.  STUAQ and STUAL are not used
 * in the code shown here.
 */
45#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
46#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
47#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
48#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
49
	/* Load from the first source byte into r63; the value is not used
	   (presumably an early touch/prefetch of the source - confirm). */
50	ld.b r3,0,r63
	/* Counts of 25 or more (unsigned compare) are handled at Large. */
51	pta/l Large,tr0
52	movi 25,r0
53	bgeu/u r4,r0,tr0
	/*
	 * Counts 0..24: computed branch into the handler table that starts
	 * at L1.  r0 = r1 - 32 * nsb(r4), so the target is
	 * L1 + 32 * (63 - nsb(r4)) (+1).
	 * NOTE(review): this relies on nsb yielding 63, 62, 61, 60, 59, 58 for
	 * counts 0, 1, 2-3, 4-7, 8-15, 16-24 respectively, on 4-byte
	 * instructions (8 per 32-byte slot), and on the +1 selecting SHmedia
	 * mode for the branch target - confirm against the SHmedia manual.
	 */
54	nsb r4,r0
55	shlli r0,5,r0
56	movi (L1-L0+63*32 + 1) & 0xffff,r1
57	sub r1, r0, r0
58L0:	ptrel r0,tr0
	/* r5 = destination end, r6 = source end, tr1 = branch-out target. */
59	add r2,r4,r5
60	ptabs r18,tr1
61	add r3,r4,r6
62	blink tr0,r63
63
	/*
	 * Handler table.  Each size class occupies eight instructions; the
	 * continuation fragments L4_7, L2_3 and L8_15 fill the space left
	 * over in the first two slots.  Do not add or remove instructions
	 * here without keeping every slot at the same size.
	 */
64	.balign 8
65L1:
66	/* 0 byte memcpy */
67	blink tr1,r63
68
	/* Entered with r0 = first 4 source bytes (hi part already stored),
	   r7/r6 = lo/hi parts of the last 4 source bytes. */
69L4_7:	/* 4..7 byte memcpy cntd. */
70	stlo.l r2, 0, r0
71	or r6, r7, r6
72	sthi.l r5, -1, r6
73	stlo.l r5, -4, r6
74	blink tr1,r63
75
	/* Entered with r6 = last source byte; store it at destination end - 1. */
76L2_3:	/* 2 or 3 byte memcpy cntd. */
77	st.b r5,-1,r6
78	blink tr1,r63
79
80	/* 1 byte memcpy */
81	ld.b r3,0,r0
82	st.b r2,0,r0
83	blink tr1,r63
84
	/* Entered with r0 = first 8 source bytes (hi part already stored),
	   r7/r6 = lo/hi parts of the last 8 source bytes. */
85L8_15:	/* 8..15 byte memcpy cntd. */
86	stlo.q r2, 0, r0
87	or r6, r7, r6
88	sthi.q r5, -1, r6
89	stlo.q r5, -8, r6
90	blink tr1,r63
91
	/* Copies bytes 0 and 1, loads the last source byte (r6 - 1) and
	   finishes at L2_3.  For a 2 byte copy the last byte is byte 1 again. */
92	/* 2 or 3 byte memcpy */
93	ld.b r3,0,r0
94	ld.b r2,0,r63
95	ld.b r3,1,r1
96	st.b r2,0,r0
97	pta/l L2_3,tr0
98	ld.b r6,-1,r6
99	st.b r2,1,r1
100	blink tr0, r63
101
	/* Loads the first 4 and the last 4 source bytes (the two ranges may
	   overlap) before finishing the stores at L4_7. */
102	/* 4 .. 7 byte memcpy */
103	LDUAL (r3, 0, r0, r1)
104	pta L4_7, tr0
105	ldlo.l r6, -4, r7
106	or r0, r1, r0
107	sthi.l r2, 3, r0
108	ldhi.l r6, -1, r6
109	blink tr0, r63
110
	/* Same scheme as the 4..7 case, using quadword accesses and L8_15. */
111	/* 8 .. 15 byte memcpy */
112	LDUAQ (r3, 0, r0, r1)
113	pta L8_15, tr0
114	ldlo.q r6, -8, r7
115	or r0, r1, r0
116	sthi.q r2, 7, r0
117	ldhi.q r6, -1, r6
118	blink tr0, r63
119
	/* First 16 bytes via two unaligned quadword loads (r0, r8), last 8
	   bytes taken relative to the source end (r6) and stored relative to
	   the destination end (r5); the ranges may overlap. */
120	/* 16 .. 24 byte memcpy */
121	LDUAQ (r3, 0, r0, r1)
122	LDUAQ (r3, 8, r8, r9)
123	or r0, r1, r0
124	sthi.q r2, 7, r0
125	or r8, r9, r8
126	sthi.q r2, 15, r8
127	ldlo.q r6, -8, r7
128	ldhi.q r6, -1, r6
129	stlo.q r2, 8, r8
130	stlo.q r2, 0, r0
131	or r6, r7, r6
132	sthi.q r5, -1, r6
133	stlo.q r5, -8, r6
134	blink tr1,r63
135
/*
 * Large: SHmedia copy of 25 or more bytes (reached from the size check at
 * the top of the routine; r2 = destination, r3 = source, r4 = count).
 *
 * Register roles established here:
 *   r7  = r3 | -8, i.e. (r3 & 7) - 8
 *   r22 = r2 - r7: destination cursor, advanced past the bytes that
 *         precede the next 8-byte boundary of the source
 *   r6  = r3 - r2, so r22 + r6 is an 8-byte aligned source address
 *   r5  = destination end - 16
 *   r0  = most recently loaded source quadword that is not yet stored
 *   tr1 = Loop_ua while copying; reloaded from r18 for the final branch
 */
136Large:
	/* Load from the first destination byte into r63; value unused
	   (presumably a touch/prefetch of the destination - confirm). */
137	ld.b r2, 0, r63
138	pta/l  Loop_ua, tr1
139	ori r3, -8, r7
140	sub r2, r7, r22
141	sub r3, r2, r6
142	add r2, r4, r5
	/* Head: lo part of the first source quadword, stored unaligned at
	   the start of the destination. */
143	ldlo.q r3, 0, r0
144	addi r5, -16, r5
145	movi 64+8, r27 // could subtract r7 from that.
146	stlo.q r2, 0, r0
147	sthi.q r2, 7, r0
	/* First aligned source quadword for the loops below. */
148	ldx.q r22, r6, r0
	/* Fewer than 72 bytes: skip the 32-bytes-per-iteration loop. */
149	bgtu/l r27, r4, tr1
150
	/* r27 = destination end - 64 (limit for Loop_line);
	   r36/r19/r20/r21 = r6 + 64 / - 24 / - 16 / - 8, used as index
	   registers for the source loads in Loop_line. */
151	addi r5, -48, r27
152	pta/l Loop_line, tr0
153	addi r6, 64, r36
154	addi r6, -24, r19
155	addi r6, -16, r20
156	addi r6, -8, r21
157
	/*
	 * Copies 32 bytes per iteration: stores the pending quadword r0 plus
	 * three freshly loaded quadwords (r23, r24, r25) with unaligned
	 * stores, then loads the next pending quadword into r0.
	 * The load into r63 reads 64 bytes ahead in the source and discards
	 * the value (presumably a prefetch); alloco presumably allocates the
	 * destination cache line without fetching it - confirm both against
	 * the SHmedia manual.
	 * Repeats while r27 >= r22 (unsigned).
	 */
158Loop_line:
159	ldx.q r22, r36, r63
160	alloco r22, 32
161	addi r22, 32, r22
162	ldx.q r22, r19, r23
163	sthi.q r22, -25, r0
164	ldx.q r22, r20, r24
165	ldx.q r22, r21, r25
166	stlo.q r22, -32, r0
167	ldx.q r22, r6,  r0
168	sthi.q r22, -17, r23
169	sthi.q r22,  -9, r24
170	sthi.q r22,  -1, r25
171	stlo.q r22, -24, r23
172	stlo.q r22, -16, r24
173	stlo.q r22,  -8, r25
174	bgeu r27, r22, tr0
175
	/* Copies 8 bytes per iteration: store the pending quadword, load the
	   next one.  Repeats while r5 > r22 (unsigned); tr1 = Loop_ua. */
176Loop_ua:
177	addi r22, 8, r22
178	sthi.q r22, -1, r0
179	stlo.q r22, -8, r0
180	ldx.q r22, r6, r0
181	bgtu/l r5, r22, tr1
182
	/* Tail: store the pending quadword at r22, then copy the last 8
	   source bytes (read unaligned relative to the source end r7) to
	   r5 + 8 .. r5 + 15, i.e. the last 8 bytes of the destination.
	   These stores may overlap bytes already written. */
183	add r3, r4, r7
184	ldlo.q r7, -8, r1
185	sthi.q r22, 7, r0
186	ldhi.q r7, -1, r7
187	ptabs r18,tr1
188	stlo.q r22, 0, r0
189	or r1, r7, r1
190	sthi.q r5, 15, r1
191	stlo.q r5, 8, r1
192	blink tr1, r63
193
194#else /* ! SHMEDIA, i.e. SH1 .. SH4 / SHcompact */
195
/*
 * Register aliases used by the SHcompact code below.  Two sets are
 * provided, selected by __SH5__.  In both sets DST, SRC and COUNT occupy
 * three consecutive registers, TMP0/TMP1 are scratch registers, and RESULT
 * names the register that receives the destination pointer on return.
 * With __SH5__ defined, RESULT is the same register as DST.
 * The code below additionally uses r0, r1 and r7 directly.
 */
196#ifdef __SH5__
197#define DST r2
198#define SRC r3
199#define COUNT r4
200#define TMP0 r5
201#define TMP1 r6
202#define RESULT r2
203#else
204#define DST r4
205#define SRC r5
206#define COUNT r6
207#define TMP0 r2
208#define TMP1 r3
209#define RESULT r0
210#endif
211
212#ifdef __LITTLE_ENDIAN__
213	! Little endian version copies with increasing addresses.
/*
 * Register roles on this path, as used by the code below:
 *   TMP1  - copy of the original DST, moved to RESULT at L_ready
 *   COUNT - turned into the source end address right after the size check
 *   r7    - loop limit derived from the source end address
 *   r0    - scratch at first, then the distance from a source address to
 *           the matching destination address (with a per-loop bias, since
 *           the indexed stores happen after SRC was post-incremented)
 * Copies shorter than 11 bytes go straight to the byte loop (L_small).
 * Longer copies first make SRC even, then pick one of three longword
 * loops depending on the alignment of DST and SRC, and finish the
 * remaining bytes in L_cleanup_loop.
 * The SL macro comes from asm.h (not shown here).  NOTE(review): the code
 * relies on the instruction passed as its last argument being executed
 * whether or not the branch is taken - confirm against asm.h.
 */
214	mov DST,TMP1	! Save return value
215	mov #11,r0	! Check if small number of bytes
216	cmp/hs r0,COUNT
217			! COUNT becomes src end address
218	SL(bf, L_small, add SRC,COUNT)
	! At least 11 bytes from here on.  r7 gets a copy of the source end.
219	mov #1,r1
220	tst r1,SRC	! check if source even
221	SL(bt, L_even, mov COUNT,r7)
222	mov.b @SRC+,r0	! no, make it even.
223	mov.b r0,@DST
224	add #1,DST
	! r7 := src end address minus 3; the longword loops run while SRC is
	! below r7.  r1 becomes 2 for the 4-byte alignment tests that follow.
225L_even:	tst r1,DST	! check if destination is even
226	add #-3,r7
227	SL(bf, L_odddst, mov #2,r1)
228	tst r1,DST	! check if destination is 4-byte aligned
229	mov DST,r0
	! r0 := DST - SRC on both paths.
230	SL(bt, L_al4dst, sub SRC,r0)
	! Destination is even but not 4-byte aligned: copy one halfword.
231	mov.w @SRC+,TMP0
232	mov.w TMP0,@DST
233	! add #2,DST  DST is dead here.
	! From here on the destination address is always formed as r0 + SRC.
234L_al4dst:
235	tst r1,SRC
236	bt L_al4both
	! Source is 2 bytes off a longword boundary: read the leading halfword
	! and keep it in the upper half of r1 so that xtrct can merge it with
	! the next longword.  The r0 bias of -6 accounts for the halfword held
	! back (2) and for SRC being post-incremented (4) before each store.
237	mov.w @SRC+,r1
238	swap.w r1,r1
239	add #-6,r0
240	add #-6,r7	! r7 := src end address minus 9.
241	.align 2
242L_2l_loop:
243	mov.l @SRC+,TMP0 ! Read & write two longwords per iteration
244	xtrct TMP0,r1
245	mov.l r1,@(r0,SRC)
246	cmp/hs r7,SRC
247	mov.l @SRC+,r1
248	xtrct r1,TMP0
249	mov.l TMP0,@(r0,SRC)
250	bf L_2l_loop
	! Step SRC back over the halfword that was read but not yet stored,
	! and rebias r0 to DST - SRC - 1 as L_cleanup_loop expects (the add
	! after bra is placed in its branch slot).
251	add #-2,SRC
252	bra  L_cleanup
253	add #5,r0
	! Both pointers 4-byte aligned.  The bias of -4 compensates for the
	! post-increment of SRC before the store.  DST is reused as the data
	! register here; its pointer value is no longer needed.
254L_al4both:
255	add #-4,r0
256	.align 2
257L_al4both_loop:
258	mov.l @SRC+,DST   ! Read longword, write longword per iteration
259	cmp/hs r7,SRC
260	SL(bf, L_al4both_loop, mov.l DST,@(r0,SRC))
261
	! Rebias r0 to DST - SRC - 1 for L_cleanup_loop.
262	bra L_cleanup
263	add #3,r0
264
	! Destination is odd (source is even).  DST is lowered by one so that
	! the stores below can use the displacements 1, 2 and 4.
265L_odddst:
266	tst r1,SRC
267	SL(bt, L_al4src, add #-1,DST)
	! Source not 4-byte aligned: copy one halfword as two single bytes.
268	mov.w @SRC+,r0
269	mov.b r0,@(1,DST)
270	shlr8 r0
271	mov.b r0,@(2,DST)
272	add #2,DST
273L_al4src:
274	.align 2
275L_odd_loop:
276	mov.l @SRC+,r0   ! Read longword, write byte, word, byte per iteration
277	cmp/hs r7,SRC
278	mov.b r0,@(1,DST)
279	shlr8 r0
280	mov.w r0,@(2,DST)
281	shlr16 r0
282	mov.b r0,@(4,DST)
283	SL(bf, L_odd_loop, add #4,DST)
284	.align 2 ! avoid nop in more frequently executed code.
	! Entered with DST one below the next destination byte;
	! r0 := DST - SRC, which is the bias L_cleanup_loop expects.
285L_cleanup2:
286	mov	DST,r0
287	sub	SRC,r0
	! Byte-wise copy of whatever is left, until SRC reaches the source end
	! address held in COUNT.  Nothing to do if they are already equal.
288L_cleanup:
289	cmp/eq	COUNT,SRC
290	bt	L_ready
291	.align 2
292L_cleanup_loop:
293	mov.b	@SRC+,r1
294	cmp/eq	COUNT,SRC
295	mov.b	r1,@(r0,SRC)
296	bf	L_cleanup_loop
	! Return the original destination pointer saved in TMP1.
297L_ready:
298	rts
299	mov	TMP1,RESULT
	! Fewer than 11 bytes: bias DST by -1 and use the byte loop only.
300L_small:
301	bra L_cleanup2
302	add #-1,DST
303#else /* ! __LITTLE_ENDIAN__ */
304	! Big endian version copies with decreasing addresses.
/*
 * Register roles on this path, as used by the code below:
 *   r0   - destination cursor; starts at the destination end and is
 *          lowered by the pre-decrement stores.  It ends up equal to the
 *          original destination pointer and is moved to RESULT.
 *   SRC  - turned into an offset (source minus destination, plus a
 *          per-loop bias) so that r0 + SRC addresses the source data
 *   TMP1 - address of the last source byte; its low bits are shifted out
 *          one at a time with shlr to test the source alignment
 *   r7   - lower limit for the longword loops, derived from DST
 *   DST  - start of the destination; the stop address for the byte loop
 * Copies shorter than 11 bytes use the byte loop only (L_small).
 * The SL macro comes from asm.h (not shown here).  NOTE(review): the code
 * relies on the instruction passed as its last argument being executed
 * whether or not the branch is taken - confirm against asm.h.
 */
305	mov DST,r0
306	add COUNT,r0
307	sub DST,SRC
308	mov #11,r1
309	cmp/hs r1,COUNT
310	SL(bf, L_small, add #-1,SRC)
	! TMP1 := address of the last source byte; shlr moves its bit 0 to T.
311	mov SRC,TMP1
312	add r0,TMP1
313	shlr TMP1
314	SL(bt, L_even,
315	mov DST,r7)
	! Source end is odd: copy one byte to make it even.
316	mov.b @(r0,SRC),TMP0
317	add #-1,TMP1
318	mov.b TMP0,@-r0
	! r7 := DST + 8 in the branch slot; SRC is rebiased for halfword and
	! longword accesses.
319L_even:
320	tst #1,r0
321	add #-1,SRC
322	SL(bf, L_odddst, add #8,r7)
323	tst #2,r0
324	bt L_al4dst
	! Destination end is even but not 4-byte aligned: copy one halfword.
325	add #-1,TMP1
326	mov.w @(r0,SRC),r1
327	mov.w r1,@-r0
328L_al4dst:
329	shlr TMP1
330	bt L_al4both
	! Source is 2 bytes off a longword boundary: read one halfword ahead
	! and merge it with each following longword using xtrct.
331	mov.w @(r0,SRC),r1
332	swap.w r1,r1
333	add #4,r7
334	add #-4,SRC
335	.align 2
	! Two longwords per iteration; repeats while r0 >= r7 (unsigned).
336L_2l_loop:
337	mov.l @(r0,SRC),TMP0
338	xtrct TMP0,r1
339	mov.l r1,@-r0
340	cmp/hs r7,r0
341	mov.l @(r0,SRC),r1
342	xtrct r1,TMP0
343	mov.l TMP0,@-r0
344	bt L_2l_loop
	! Rebias SRC for the byte loop (the add sits in the slot of bra).
345	bra L_cleanup
346	add #5,SRC
347
348	nop ! avoid nop in executed code.
	! Both pointers 4-byte aligned: one longword per iteration, repeating
	! while r0 >= r7 (unsigned).
349L_al4both:
350	add #-2,SRC
351	.align 2
352L_al4both_loop:
353	mov.l @(r0,SRC),r1
354	cmp/hs r7,r0
355	SL(bt, L_al4both_loop,
356	mov.l r1,@-r0)
	! Rebias SRC for the byte loop.
357	bra L_cleanup
358	add #3,SRC
359
360	nop ! avoid nop in executed code.
	! Destination end is odd (source end is even).
361L_odddst:
362	shlr TMP1
363	bt L_al4src
	! Source not 4-byte aligned: copy one halfword as two single bytes.
364	mov.w @(r0,SRC),r1
365	mov.b r1,@-r0
366	shlr8 r1
367	mov.b r1,@-r0
368L_al4src:
369	add #-2,SRC
370	.align 2
	! Read a longword, write it as byte, halfword, byte (going downwards);
	! repeats while r0 >= r7 (unsigned, tested before the stores).
371L_odd_loop:
372	mov.l @(r0,SRC),TMP0
373	cmp/hs r7,r0
374	mov.b TMP0,@-r0
375	shlr8 TMP0
376	mov.w TMP0,@-r0
377	shlr16 TMP0
378	mov.b TMP0,@-r0
379	bt L_odd_loop
380
	! Rebias SRC for the byte loop.
381	add #3,SRC
	! Byte-wise copy of whatever is left, downwards, until r0 reaches the
	! start of the destination.  Nothing to do if r0 equals DST already.
	! DST is raised by one so that the compare, which is done before the
	! pre-decrement store, detects the last byte.
382L_cleanup:
383L_small:
384	cmp/eq DST,r0
385	bt L_ready
386	add #1,DST
387	.align 2
388L_cleanup_loop:
389	mov.b @(r0,SRC),TMP0
390	cmp/eq DST,r0
391	mov.b TMP0,@-r0
392	bf L_cleanup_loop
	! r0 now equals the original destination pointer; return it.
393L_ready:
394	rts
395	mov r0,RESULT
396#endif /* ! __LITTLE_ENDIAN__ */
397#endif /* ! SHMEDIA */
398