!
! Fast SH memcpy
!
! by Toshiyasu Morita (tm@netcom.com)
! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
/* SH5 code Copyright 2002 SuperH Ltd. */
!
! Entry: ARG0: destination pointer
!        ARG1: source pointer
!        ARG2: byte count
!
! Exit:  RESULT: destination pointer
!        any other registers in the range r0-r7: trashed
!
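! In C terms, ARG0, ARG1, ARG2 and RESULT correspond to the dst, src and n
! arguments and the return value of the standard
! void *memcpy (void *dst, const void *src, size_t n); the concrete
! registers they map to are ABI dependent (see the DST/SRC/COUNT macros in
! the SHcompact section below).
!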
! Notes: Usually one wants to do small reads and write a longword, but
!        unfortunately it is difficult in some cases to concatenate bytes
!        into a longword on the SH, so this does a longword read and small
!        writes.
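!
!        For illustration, the misaligned-destination inner loop below
!        (L_odd_loop) has exactly this shape: one aligned longword read
!        followed by byte/word/byte writes:
!            mov.l  @SRC+,r0
!            mov.b  r0,@(1,DST)
!            shlr8  r0
!            mov.w  r0,@(2,DST)
!            shlr16 r0
!            mov.b  r0,@(4,DST)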
!
! This implementation makes two assumptions about how it is called:
!
! 1.: If the byte count is nonzero, the address of the last byte to be
!     copied is unsigned greater than the address of the first byte to
!     be copied.  This could be easily swapped for a signed comparison,
!     but the algorithm used needs some comparison.
!
! 2.: When there are two or three bytes in the last word of an 11-or-more-byte
!     memory chunk to be copied, the rest of the word can be read without
!     side effects (illustrated below).
!     This could be easily changed by increasing the minimum size of
!     a fast memcpy and the amount subtracted from r7 before L_2l_loop by 2;
!     however, this would cost a few extra cycles on average.
!     For SHmedia, the assumption is that any quadword can be read in its
!     entirety if at least one byte is included in the copy.
!
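! Illustration of assumption 2 (SHcompact paths): L_2l_loop reads source
! longwords in pairs.  When only two or three payload bytes are left in the
! last longword it reads, the remaining byte(s) of that longword are fetched
! as well even though they lie just past the requested source region; the
! assumption is that this over-read cannot fault or have side effects.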

#include "asm.h"

ENTRY(memcpy)

#if __SHMEDIA__

#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1

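/* The LDUAQ/STUAQ (and LDUAL/STUAL) macros above pair an ldlo with the
   matching ldhi (or stlo with sthi), so that one arbitrarily aligned
   quadword or longword access is assembled from the two aligned words it
   straddles.

   Copies of fewer than 25 bytes are dispatched to the size-class stubs
   between L1 and Large: nsb of the byte count selects the class (0, 1,
   2..3, 4..7, 8..15, 16..24), and ptrel branches to the matching stub,
   the stubs being laid out 32 bytes apart starting at L1.  r5 and r6 are
   preset to one past the destination and source end for use by the stubs,
   and tr1 holds the return address.  */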
	ld.b r3,0,r63
	pta/l Large,tr0
	movi 25,r0
	bgeu/u r4,r0,tr0
	nsb r4,r0
	shlli r0,5,r0
	movi (L1-L0+63*32 + 1) & 0xffff,r1
	sub r1, r0, r0
L0:	ptrel r0,tr0
	add r2,r4,r5
	ptabs r18,tr1
	add r3,r4,r6
	blink tr0,r63

	.balign 8
L1:
	/* 0 byte memcpy */
	blink tr1,r63

L4_7:	/* 4..7 byte memcpy cntd. */
	stlo.l r2, 0, r0
	or r6, r7, r6
	sthi.l r5, -1, r6
	stlo.l r5, -4, r6
	blink tr1,r63

L2_3:	/* 2 or 3 byte memcpy cntd. */
	st.b r5,-1,r6
	blink tr1,r63

	/* 1 byte memcpy */
	ld.b r3,0,r0
	st.b r2,0,r0
	blink tr1,r63

L8_15:	/* 8..15 byte memcpy cntd. */
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6
	stlo.q r5, -8, r6
	blink tr1,r63

	/* 2 or 3 byte memcpy */
	ld.b r3,0,r0
	ld.b r2,0,r63
	ld.b r3,1,r1
	st.b r2,0,r0
	pta/l L2_3,tr0
	ld.b r6,-1,r6
	st.b r2,1,r1
	blink tr0, r63

	/* 4 .. 7 byte memcpy */
	LDUAL (r3, 0, r0, r1)
	pta L4_7, tr0
	ldlo.l r6, -4, r7
	or r0, r1, r0
	sthi.l r2, 3, r0
	ldhi.l r6, -1, r6
	blink tr0, r63

	/* 8 .. 15 byte memcpy */
	LDUAQ (r3, 0, r0, r1)
	pta L8_15, tr0
	ldlo.q r6, -8, r7
	or r0, r1, r0
	sthi.q r2, 7, r0
	ldhi.q r6, -1, r6
	blink tr0, r63

	/* 16 .. 24 byte memcpy */
	LDUAQ (r3, 0, r0, r1)
	LDUAQ (r3, 8, r8, r9)
	or r0, r1, r0
	sthi.q r2, 7, r0
	or r8, r9, r8
	sthi.q r2, 15, r8
	ldlo.q r6, -8, r7
	ldhi.q r6, -1, r6
	stlo.q r2, 8, r8
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6
	stlo.q r5, -8, r6
	blink tr1,r63

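/* 25 bytes or more.  The leading 8 - (src & 7) bytes are copied with an
   unaligned ldlo.q/stlo.q+sthi.q so that all further loads come from
   quadword-aligned source addresses.  r22 then tracks the destination
   position, and r6 = src - dst turns it back into the matching aligned
   source address for ldx.q; r5 is preset to dst + count - 16 for the tail.
   Copies of at least 64+8 bytes also run through Loop_line below (one
   32-byte cache line per iteration); shorter ones go straight to
   Loop_ua.  */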
Large:
	ld.b r2, 0, r63
	pta/l  Loop_ua, tr1
	ori r3, -8, r7
	sub r2, r7, r22
	sub r3, r2, r6
	add r2, r4, r5
	ldlo.q r3, 0, r0
	addi r5, -16, r5
	movi 64+8, r27 // could subtract r7 from that.
	stlo.q r2, 0, r0
	sthi.q r2, 7, r0
	ldx.q r22, r6, r0
	bgtu/l r27, r4, tr1

	addi r5, -48, r27
	pta/l Loop_line, tr0
	addi r6, 64, r36
	addi r6, -24, r19
	addi r6, -16, r20
	addi r6, -8, r21

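/* Main loop for long copies: one 32-byte cache line of the destination per
   iteration.  The ldx.q into r63 prefetches source data 64 bytes ahead,
   alloco preallocates the destination cache line so it need not be fetched
   from memory, and four source-aligned quadwords are then loaded and stored
   with stlo/sthi pairs (the destination may be unaligned).  r0 carries one
   loaded quadword over into the next iteration.  */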
Loop_line:
	ldx.q r22, r36, r63
	alloco r22, 32
	addi r22, 32, r22
	ldx.q r22, r19, r23
	sthi.q r22, -25, r0
	ldx.q r22, r20, r24
	ldx.q r22, r21, r25
	stlo.q r22, -32, r0
	ldx.q r22, r6,  r0
	sthi.q r22, -17, r23
	sthi.q r22,  -9, r24
	sthi.q r22,  -1, r25
	stlo.q r22, -24, r23
	stlo.q r22, -16, r24
	stlo.q r22,  -8, r25
	bgeu r27, r22, tr0

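/* Tail loop: one aligned source quadword per iteration, stored unaligned
   at the destination, while r22 stays below r5 = dst + count - 16.  The
   last quadword of the source is then reloaded unaligned and stored so
   that the copy ends exactly at dst + count.  */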
Loop_ua:
	addi r22, 8, r22
	sthi.q r22, -1, r0
	stlo.q r22, -8, r0
	ldx.q r22, r6, r0
	bgtu/l r5, r22, tr1

	add r3, r4, r7
	ldlo.q r7, -8, r1
	sthi.q r22, 7, r0
	ldhi.q r7, -1, r7
	ptabs r18,tr1
	stlo.q r22, 0, r0
	or r1, r7, r1
	sthi.q r5, 15, r1
	stlo.q r5, 8, r1
	blink tr1, r63

#else /* ! SHMEDIA, i.e. SH1 .. SH4 / SHcompact */

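! Register assignment for the SHcompact code below: the SH5 compact ABI
! passes the arguments in r2..r4 and returns the result in r2, while
! SH1..SH4 pass them in r4..r6 and return in r0; the macros pick matching
! scratch registers for each case.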
#ifdef __SH5__
#define DST r2
#define SRC r3
#define COUNT r4
#define TMP0 r5
#define TMP1 r6
#define RESULT r2
#else
#define DST r4
#define SRC r5
#define COUNT r6
#define TMP0 r2
#define TMP1 r3
#define RESULT r0
#endif

#ifdef __LITTLE_ENDIAN__
	! Little endian version copies with increasing addresses.
	mov DST,TMP1	! Save return value
	mov #11,r0	! Check if small number of bytes
	cmp/hs r0,COUNT
			! COUNT becomes src end address
	SL(bf, L_small, add SRC,COUNT)
	mov #1,r1
	tst r1,SRC	! check if source even
	SL(bt, L_even, mov COUNT,r7)
	mov.b @SRC+,r0	! no, make it even.
	mov.b r0,@DST
	add #1,DST
L_even:	tst r1,DST	! check if destination is even
	add #-3,r7
	SL(bf, L_odddst, mov #2,r1)
	tst r1,DST	! check if destination is 4-byte aligned
	mov DST,r0
	SL(bt, L_al4dst, sub SRC,r0)
	mov.w @SRC+,TMP0
	mov.w TMP0,@DST
	! add #2,DST  DST is dead here.
L_al4dst:
	tst r1,SRC
	bt L_al4both
	mov.w @SRC+,r1
	swap.w r1,r1
	add #-6,r0
	add #-6,r7	! r7 := src end address minus 9.
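	! The destination is now longword aligned but the source is only
	! 2-byte aligned; xtrct splices the halves of two consecutive source
	! longwords so that every store in L_2l_loop is an aligned longword
	! store.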
	.align 2
L_2l_loop:
	mov.l @SRC+,TMP0 ! Read & write two longwords per iteration
	xtrct TMP0,r1
	mov.l r1,@(r0,SRC)
	cmp/hs r7,SRC
	mov.l @SRC+,r1
	xtrct r1,TMP0
	mov.l TMP0,@(r0,SRC)
	bf L_2l_loop
	add #-2,SRC
	bra  L_cleanup
	add #5,r0
L_al4both:
	add #-4,r0
	.align 2
L_al4both_loop:
	mov.l @SRC+,DST   ! Read longword, write longword per iteration
	cmp/hs r7,SRC
	SL(bf, L_al4both_loop, mov.l DST,@(r0,SRC))

	bra L_cleanup
	add #3,r0

L_odddst:
	tst r1,SRC
	SL(bt, L_al4src, add #-1,DST)
	mov.w @SRC+,r0
	mov.b r0,@(1,DST)
	shlr8 r0
	mov.b r0,@(2,DST)
	add #2,DST
L_al4src:
	.align 2
L_odd_loop:
	mov.l @SRC+,r0   ! Read longword, write byte, word, byte per iteration
	cmp/hs r7,SRC
	mov.b r0,@(1,DST)
	shlr8 r0
	mov.w r0,@(2,DST)
	shlr16 r0
	mov.b r0,@(4,DST)
	SL(bf, L_odd_loop, add #4,DST)
	.align 2 ! avoid nop in more frequently executed code.
L_cleanup2:
	mov	DST,r0
	sub	SRC,r0
L_cleanup:
	cmp/eq	COUNT,SRC
	bt	L_ready
	.align 2
L_cleanup_loop:
	mov.b	@SRC+,r1
	cmp/eq	COUNT,SRC
	mov.b	r1,@(r0,SRC)
	bf	L_cleanup_loop
L_ready:
	rts
	mov	TMP1,RESULT
L_small:
	bra L_cleanup2
	add #-1,DST
#else /* ! __LITTLE_ENDIAN__ */
	! Big endian version copies with decreasing addresses.
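	! r0 runs downwards from the destination end, and SRC is rewritten to
	! hold the (adjusted) difference src - dst, so that @(r0,SRC) addresses
	! the source data belonging at the destination position tracked by r0.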
	mov DST,r0
	add COUNT,r0
	sub DST,SRC
	mov #11,r1
	cmp/hs r1,COUNT
	SL(bf, L_small, add #-1,SRC)
	mov SRC,TMP1
	add r0,TMP1
	shlr TMP1
	SL(bt, L_even,
	mov DST,r7)
	mov.b @(r0,SRC),TMP0
	add #-1,TMP1
	mov.b TMP0,@-r0
L_even:
	tst #1,r0
	add #-1,SRC
	SL(bf, L_odddst, add #8,r7)
	tst #2,r0
	bt L_al4dst
	add #-1,TMP1
	mov.w @(r0,SRC),r1
	mov.w r1,@-r0
L_al4dst:
	shlr TMP1
	bt L_al4both
	mov.w @(r0,SRC),r1
	swap.w r1,r1
	add #4,r7
	add #-4,SRC
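	! As in the little-endian path, xtrct merges the halves of two
	! consecutive source longwords so that each @-r0 store below is an
	! aligned longword store.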
	.align 2
L_2l_loop:
	mov.l @(r0,SRC),TMP0
	xtrct TMP0,r1
	mov.l r1,@-r0
	cmp/hs r7,r0
	mov.l @(r0,SRC),r1
	xtrct r1,TMP0
	mov.l TMP0,@-r0
	bt L_2l_loop
	bra L_cleanup
	add #5,SRC

	nop ! avoid nop in executed code.
L_al4both:
	add #-2,SRC
	.align 2
L_al4both_loop:
	mov.l @(r0,SRC),r1
	cmp/hs r7,r0
	SL(bt, L_al4both_loop,
	mov.l r1,@-r0)
	bra L_cleanup
	add #3,SRC

	nop ! avoid nop in executed code.
L_odddst:
	shlr TMP1
	bt L_al4src
	mov.w @(r0,SRC),r1
	mov.b r1,@-r0
	shlr8 r1
	mov.b r1,@-r0
L_al4src:
	add #-2,SRC
	.align 2
L_odd_loop:
	mov.l @(r0,SRC),TMP0
	cmp/hs r7,r0
	mov.b TMP0,@-r0
	shlr8 TMP0
	mov.w TMP0,@-r0
	shlr16 TMP0
	mov.b TMP0,@-r0
	bt L_odd_loop

	add #3,SRC
L_cleanup:
L_small:
	cmp/eq DST,r0
	bt L_ready
	add #1,DST
	.align 2
L_cleanup_loop:
	mov.b @(r0,SRC),TMP0
	cmp/eq DST,r0
	mov.b TMP0,@-r0
	bf L_cleanup_loop
L_ready:
	rts
	mov r0,RESULT
#endif /* ! __LITTLE_ENDIAN__ */
#endif /* ! SHMEDIA */