1/*
2 *  (c) Copyright 1986 HEWLETT-PACKARD COMPANY
3 *
4 *  To anyone who acknowledges that this file is provided "AS IS"
5 *  without any express or implied warranty:
6 *      permission to use, copy, modify, and distribute this file
7 *  for any purpose is hereby granted without fee, provided that
8 *  the above copyright notice and this notice appears in all
9 *  copies, and that the name of Hewlett-Packard Company not be
10 *  used in advertising or publicity pertaining to distribution
11 *  of the software without specific, written prior permission.
12 *  Hewlett-Packard Company makes no representations about the
13 *  suitability of this software for any purpose.
14 */
15
16/*
17	A faster strcpy.
18
19	by
20
21	Jerry Huck (aligned case)
22	Daryl Odnert (equal-alignment case)
23	Edgar Circenis (non-aligned case)
24*/
25/*
26 * strcpy(s1, s2)
27 *
28 * Copy string s2 to s1.  s1 must be large enough.
29 * return s1
30 */
31
32#include <picolibc.h>
33
34#include "DEFS.h"
35
36#define	d_addr		r26
37#define	s_addr		r25
38#define	tmp6		r24
39#define	tmp1		r19
40#define evenside	r19
41#define	tmp2		r20
42#define oddside		r20
43#define	tmp3		r21
44#define	tmp4		r22
45#define	tmp5		arg3
46#define	save		r1
47
48
49ENTRY(strcpy)
/* Entry sequence: probe the alignment of both pointers and dispatch.
   Per the #defines above, d_addr (r26) is the destination pointer and
   s_addr (r25) the source pointer.  tmp6 is built up as a dispatch index:
   the low two bits of s_addr are extracted into it, then the low two bits
   of d_addr are deposited just above them, giving the "TGT SRC" index
   used by the table at case_analysis.  The original d_addr is copied to
   ret0 (a name defined outside this file) as the return value. */
50/* Do some quick alignment checking and take the fast path when both are word aligned */
51        extru,<>   s_addr,31,2,tmp6    /*Is source word aligned? */
52        ldwm       4(0,s_addr),oddside /*Assume yes and guess that it
53                                          is double-word aligned. */
/* NOTE(review): the ",<>" completer above is expected to nullify the ldwm
   when the source is NOT word aligned, so the speculative load (and its
   post-increment of s_addr by 4) only happens for an aligned source --
   confirm against the PA-RISC manual. */
54        dep,=      d_addr,29,2,tmp6    /*Is target word aligned? */
/* When tmp6 ends up zero (both pointers word aligned) the branch below is
   expected to be nullified and control falls through into bothaligned.
   The copy is executed on both paths: as the branch delay slot when the
   branch is taken, or as the next sequential instruction when it is not. */
55        b          case_analysis
56	copy       d_addr,ret0
57/* Both are aligned.  First source word already loaded assuming that
58   source was oddword aligned.  Fall through (therefore fastest) code
59   shuffles the registers to join the main loop */
/* Invariant on entry (from the fall-through above and from the two
   "b bothaligned" sites, which load in their delay slots): oddside holds
   the next aligned source word and s_addr has already been advanced past
   it by the ldwm that loaded it.
   Idiom used throughout: "uxor,nbz X,r0,save" copies X into save (X xor 0)
   and is expected to nullify the following "b,n nullfound" when X has no
   zero byte, so the branch is only taken once the terminator is seen.
   NOTE(review): completer semantics (nbz = no byte zero, sbz = some byte
   zero) should be confirmed against the PA-RISC manual. */
60bothaligned:
61	bb,>=    s_addr,29,twoatatime  /*Branch if source was odd aligned*/
62	uxor,nbz oddside,r0,save
/* The uxor above sits in the delay slot of the bb, so when the bb is taken
   its nullification applies to the first instruction at twoatatime (the
   "b,n nullfound" there); when the bb is not taken it applies to the
   "b,n nullfound" immediately below. */
63
64/* Even aligned source.  save holds that operand.
65   Do one iteration of the main copy loop juggling the registers to avoid
66   one copy. */
67	b,n	 nullfound
68	ldwm     4(s_addr),oddside
69	stwm     save,4(d_addr)
70	uxor,nbz oddside,r0,save
71	b,n      nullfound
72        ldwm     4(s_addr),evenside
73        stwm     oddside,4(d_addr)
74        uxor,nbz evenside,r0,save
75        b,n      nullfound
76        ldwm     4(s_addr),oddside
77
78/* Main loop body.  Entry expects evenside still to be stored, oddside
79   just loaded. */
/* The loop is unrolled two words per iteration: evenside and oddside
   alternate as "word being stored" and "word just loaded", so no register
   copy is needed between iterations. */
80loop:
81        stwm     evenside,4(d_addr)
82        uxor,nbz oddside,r0,save
83
84/* mid loop entry */
85twoatatime:
86        b,n      nullfound
87        ldwm     4(s_addr),evenside
88        stwm     oddside,4(d_addr)
/* Inverse test here: with ",sbz" the "b loop" below is expected to be
   nullified when evenside DOES contain a zero byte, so control falls out
   of the loop into nullfound with save = evenside. */
89        uxor,sbz evenside,r0,save
90        b        loop
91        ldwm     4(s_addr),oddside
/* NOTE(review): the ldwm above runs whether or not the branch was
   nullified (delay slot when looping, plain next instruction when falling
   through), so on exit it reads the word following the one that holds the
   terminator.  Confirm that this over-read cannot fault, e.g. when the
   string ends in the last word of a mapped page. */
92
93/* fall through when null found in evenside.  oddside actually loaded */
/* Common tail for all word-copy paths.  On entry save holds the word that
   contains the terminating null and d_addr addresses the destination word
   it belongs to.  The bytes of save are examined left to right (the 8-bit
   fields ending at bit 7, 15 and 23).  Each "extru,<>" is expected to
   nullify the addib that follows it when the tested byte is non-zero.
   For the first zero byte found, d_addr is advanced by 1, 2 or 3 and
   control goes to the stbys,e store; if none of the first three bytes is
   zero the terminator is the last byte and the whole word is stored with
   stw.  NOTE(review): stbys,e is presumably the "ending" form of store
   bytes, writing only the bytes of the word that lie below the advanced
   d_addr -- confirm against the PA-RISC manual. */
94nullfound:				/* adjust d_addr and store final word */
95
96	extru,<>	save,7,8,r0         /* pick up leftmost byte */
97	addib,tr,n	1,d_addr,store_final
98	extru,<>	save,15,8,r0
99	addib,tr,n	2,d_addr,store_final
100	extru,<> 	save,23,8,r0
/* Unlike the two addib above, this one has no ",n": its delay slot (the
   "bv 0(rp)" below) is executed, and the branch target store_final2 then
   performs the store.  NOTE(review): this relies on a branch executing in
   the delay slot of a taken branch -- confirm the intended behaviour. */
101	addib,tr	3,d_addr,store_final2
102	bv		0(rp)
103	stw		save,0(d_addr)
104
105store_final:
106	bv		0(rp)
107store_final2:
108	stbys,e		save,0(d_addr) 	/* delay slot */
109
110case_analysis:
/* Dispatch on the 4-bit alignment index in tmp6 (target alignment in the
   upper two bits, source alignment in the lower two, matching the
   "TGT SRC" column below).  The blr is an indexed branch into the table
   that follows; every table entry is exactly two instructions (a branch
   plus its delay slot), so no instruction may be added to or removed from
   an entry.  NOTE(review): the scaling of tmp6 by the blr (one entry = 8
   bytes) is inferred from the table layout -- confirm against the manual. */
111
112        blr         tmp6,r0
113        nop
114
115	/* NOTE: the delay slots for the non-aligned cases load a   */
116	/* shift quantity which is TGT-SRC into tmp3.               */
117        /* Note also, the case for both strings being word aligned  */
118	/* is already checked before the BLR is executed, so that   */
119	/* case can never occur.                                    */
120
/* Routing summary for the table below:
     TGT <  SRC  -> neg_aligned_copy, tmp3 negative
     TGT >  SRC  -> pos_aligned_copy, tmp3 positive; the SRC == 00 rows use
                    the pos_aligned_copy0 entry point instead
     TGT == SRC  -> equal_alignment_1 / equal_alignment_2, whose delay
                    slots load the first byte / halfword of the source
                    rather than a shift quantity; the 11/11 case is
                    handled inline as the final table entry. */
121                                       /* TGT SRC */
122        nop                            /* 00  00  can't happen */
123        nop
124        b           neg_aligned_copy   /* 00  01  */
125	ldi         -1,tmp3            /* load shift quantity. delay slot */
126        b           neg_aligned_copy   /* 00  10  */
127	ldi         -2,tmp3            /* load shift quantity. delay slot */
128        b           neg_aligned_copy   /* 00  11  */
129	ldi         -3,tmp3            /* load shift quantity. delay slot */
130        b           pos_aligned_copy0  /* 01  00  */
131	ldi         1,tmp3            /* load shift quantity. delay slot */
132        b           equal_alignment_1  /* 01  01  */
133        ldbs,ma     1(s_addr),tmp1
134        b           neg_aligned_copy   /* 01  10  */
135	ldi         -1,tmp3            /* load shift quantity. delay slot */
136        b           neg_aligned_copy   /* 01  11  */
137	ldi         -2,tmp3            /* load shift quantity. delay slot */
138        b           pos_aligned_copy0  /* 10  00  */
139	ldi         2,tmp3            /* load shift quantity. delay slot */
140        b           pos_aligned_copy   /* 10  01  */
141	ldi         1,tmp3            /* load shift quantity. delay slot */
142        b           equal_alignment_2  /* 10  10  */
143        ldhs,ma     2(s_addr),tmp1
144        b           neg_aligned_copy   /* 10  11  */
145	ldi         -1,tmp3            /* load shift quantity. delay slot */
146        b           pos_aligned_copy0  /* 11  00  */
147	ldi         3,tmp3            /* load shift quantity. delay slot */
148        b           pos_aligned_copy   /* 11  01  */
149	ldi         2,tmp3            /* load shift quantity. delay slot */
150        b           pos_aligned_copy   /* 11  10  */
151	ldi         1,tmp3            /* load shift quantity. delay slot */
/* Last entry, TGT == SRC == 3: only one byte separates both pointers from
   word alignment, so it is copied inline.  The stbs,ma below is executed
   on both paths (delay slot of the bv when the byte is null, plain next
   instruction otherwise), so the terminator itself is also stored before
   returning.  After a non-null byte both pointers are word aligned and
   the aligned loop is entered with its first word loaded in the delay
   slot of the branch. */
152        ldbs,ma     1(s_addr),tmp1     /* 11  11  */
153        comiclr,<>  r0,tmp1,r0
154        bv          0(rp)              /* return if 1st byte was null */
155        stbs,ma     tmp1,1(d_addr)     /* store a byte to dst string  */
156        b           bothaligned       /* can now goto word_aligned   */
157        ldwm        4(s_addr),oddside     /* load next word of source    */
158
159equal_alignment_1:
/* Source and target share the same non-zero alignment, so a byte and/or a
   halfword copy brings both to a word boundary.
   equal_alignment_1 (offset 1): tmp1 already holds the first source byte,
   loaded in the delay slot of the dispatching branch.  It is stored, the
   following halfword is loaded, and control falls into equal_alignment_2.
   equal_alignment_2 (offset 2): tmp1 holds a source halfword; its two
   bytes are tested and stored one at a time.
   In every test/bv/store triple the store is executed on both paths
   (delay slot of the bv when returning, plain next instruction otherwise),
   so a null byte is written to the destination before the return. */
160        comiclr,<>  r0,tmp1,r0      /* nullify next if tmp1 <> 0  */
161        bv          0(rp)           /* return if null byte found  */
162        stbs,ma     tmp1,1(d_addr)  /* store a byte to dst string */
163        ldhs,ma     2(s_addr),tmp1  /* load next halfword         */
164equal_alignment_2:
/* tmp6 is reused here as scratch for the left byte of the halfword. */
165        extru,<>    tmp1,23,8,tmp6  /* look at left byte of halfword */
166        bv          0(rp)           /* return if 1st byte was null */
167        stbs,ma     tmp6,1(d_addr)
/* Right byte of the halfword: tested via a discarded extract to r0, then
   stored from tmp1.  NOTE(review): stbs presumably writes only the low
   byte of tmp1 -- confirm. */
168        extru,<>    tmp1,31,8,r0
169        bv          0(rp)           /* return if 2nd byte was null */
170        stbs,ma     tmp1,1(d_addr)
/* Both pointers are now word aligned: join the aligned loop with the next
   source word loaded in the delay slot. */
171        b           bothaligned
172        ldwm        4(s_addr),oddside  /* load next word              */
173
174/* source and destination are not aligned, so we do it the hard way. */
175
176/* target alignment is greater than source alignment */
/* On entry tmp3 holds TGT-SRC (1..3), loaded by the dispatch table.
   pos_aligned_copy0 is the entry used for a word-aligned source: it first
   backs s_addr up by 4, undoing the post-increment of the speculative
   ldwm executed at function entry for an aligned source. */
177pos_aligned_copy0:
178	addi		-4,s_addr,s_addr
179pos_aligned_copy:
180        extru       d_addr,31,2,tmp6   /* Extract low 2 bits of the dest addr */
181        extru       s_addr,31,2,tmp1   /* Extract low 2 bits of the src addr */
182        dep         r0,31,2,s_addr     /* Compute word address of the source. */
183        sh3add		tmp3,r0,tmp4        /* compute shift amt */
184        ldwm        	4(0,s_addr),tmp2    /* get 1st source word */
/* The first source word is read from the rounded-down address, so it may
   begin with bytes that precede the string.  A mask derived from the
   source byte offset (tmp1 * 8, via cr11) is OR-ed in so that those bytes
   cannot be mistaken for the terminator.  NOTE(review): the exact mask
   produced by "zvdepi -2,32" for each offset should be confirmed against
   the PA-RISC manual. */
185	sh3add		tmp1,r0,save  	    /* setup mask shift amount */
186	mtctl		save,r11	    /* set-up cr11 for mask */
187	zvdepi		-2,32,save	    /* create mask */
188	or		save,tmp2,tmp2	    /* mask unused bytes in src */
/* tmp1 = all ones supplies the filler shifted in ahead of the data, so the
   destination bytes that precede d_addr in tmp3 are also non-zero. */
189	ldi		-1,tmp1		    /* load tmp1 with 0xffffffff */
190        mtctl        	tmp4,r11            /* shift count -> shift count reg */
191        vshd        	tmp1,tmp2,tmp3      /* position data ! */
/* tmp3 is the first destination word's worth of data.  If it contains the
   terminator, finish byte-wise at first_null; if the terminator is only
   in the part of tmp2 that has not been shifted into tmp3, go to
   nullfound1; otherwise enter the main unaligned loop. */
192	uxor,nbz	tmp3,r0,save
193	b,n		first_null
194	uxor,nbz	tmp2,r0,save
195	b		nullfound1
196        mtctl        	tmp4,r11            /* re-load shift cnt (delay slot) */
197	b		loop_entry
198        ldwm        	4(0,s_addr),tmp1    /* get next word. delay slot */
199
200neg_aligned_copy:
/* Source alignment is greater than target alignment: tmp3 holds TGT-SRC,
   which is negative (-1..-3) for every dispatch-table row that branches
   here.  The source is never word aligned on this path, so the
   speculative load at function entry did not run and s_addr needs no
   correction.  NOTE(review): tmp4 = tmp3 * 8 is negative here; presumably
   only the low bits are used once moved to cr11 -- confirm. */
201        extru       d_addr,31,2,tmp6   /* Extract low 2 bits of the dest addr */
202	extru	    s_addr,31,2,tmp2   /* Extract low 2 bits of the src addr */
203        dep         r0,31,2,s_addr     /* Compute word address of the source. */
204        sh3add		tmp3,r0,tmp4        /* compute shift amt */
205        ldwm         	4(0,s_addr),tmp1    /* load first word from source. */
206/* check to see if next word can be read safely */
/* The bytes of the first word that precede the string are forced non-zero
   with a mask derived from the source byte offset (tmp2 * 8); the word is
   then tested for the terminator BEFORE the second source word is loaded.
   If the string ends inside the first word, control goes to first_null0
   and the second word is never read. */
207	sh3add		tmp2,r0,save
208        mtctl        	save,r11            /* shift count -> shift count reg */
209	zvdepi		-2,32,save
210	or		save, tmp1, tmp1
211	uxor,nbz	tmp1,r0,save	    /* any nulls in first word? */
212	b		first_null0
/* The mtctl below is executed on both paths (delay slot of the branch, or
   next instruction when the branch is nullified): it switches cr11 from
   the mask amount back to the data shift amount. */
213	mtctl		tmp4,r11
214        ldwm        	4(0,s_addr),tmp2    /* load second word from source */
215	combt,=		tmp6,r0,chunk1      /* don't mask if whole word valid */
216        vshd        	tmp1,tmp2,tmp3      /* position data ! */
/* Destination not word aligned: force the bytes of tmp3 that precede
   d_addr to non-zero (mask derived from the destination offset tmp6) so
   they are not taken for the terminator in the test that follows. */
217	sh3add		tmp6,r0,save  	    /* setup r1 */
218	mtctl		save,r11	    /* set-up cr11 for mask */
219	zvdepi		-2,32,save
220	or		save, tmp3, tmp3
/* Same three-way exit as the positive case: terminator inside tmp3 ->
   first_null; terminator only in the not-yet-stored part of tmp2 ->
   nullfound1; otherwise enter the main loop with the next word in tmp1. */
221	uxor,nbz	tmp3,r0,save
222	b,n		first_null
223	uxor,nbz	tmp2,r0,save
224	b		nullfound1
225        mtctl        	tmp4,r11            /* re-load shift cnt (delay slot) */
226	b		loop_entry
227        ldwm        	4(0,s_addr),tmp1    /* get next word. delay slot */
228
229chunk1:
/* Reached from neg_aligned_copy when the destination offset (tmp6) is
   zero, so no leading-byte mask is needed.  tmp1 is the first source word
   and tmp2 the second; if tmp2 holds the terminator go to nullfound0,
   otherwise fall into the main loop.  The vshd is executed on both paths. */
230	uxor,nbz	tmp2,r0,save
231	b		nullfound0
232	vshd		tmp1,tmp2,tmp3
/* Main unaligned loop, two source words per iteration.  tmp1 and tmp2
   alternate as "older" and "newer" source word; tmp3 is the destination
   word assembled from the pair with vshd, using the shift count held in
   cr11. */
233did_mask:
234        ldwm        	4(0,s_addr),tmp1    /* get next word !  */
235loop_entry:
/* NOTE(review): stbys,b,m is presumably the "beginning" form of store
   bytes with base update, so that the first, possibly partial, destination
   word is written without touching bytes before d_addr -- confirm, along
   with the value left in d_addr, against the PA-RISC manual. */
236        stbys,b,m   	tmp3,4(0,d_addr)    /* store !  */
237
238	uxor,nbz	tmp1, r0, save
239	b		nullfound2
240        vshd        	tmp2,tmp1,tmp3      /* position data !  */
241	ldwm		4(s_addr),tmp2
242	stwm		tmp3,4(d_addr)
/* ",sbz" inverts the test: the branch back to did_mask is expected to be
   nullified when tmp2 contains a zero byte, falling into nullfound0.  The
   vshd under the nullfound0 label is both the delay slot of that branch
   and the first instruction of nullfound0. */
243	uxor,sbz	tmp2,r0,save
244	b		did_mask
245nullfound0:
246	vshd		tmp1,tmp2,tmp3	    /* delay slot */
/* tmp2 holds the terminator.  If the assembled word tmp3 already contains
   it, finish in nullfound with save = tmp3. */
247	uxor,nbz	tmp3,r0,save
248	b,n		nullfound
/* Otherwise store tmp3 and finish with the leftover bytes of tmp2, shifted
   into position with zeros behind them (vshd against r0). */
249nullfound1:
250	stbys,b,m	tmp3,4(0,d_addr)
251	b		nullfound
252	vshd		tmp2,r0,save	    /* delay slot */
253
/* tmp1 holds the terminator and tmp3 = vshd(tmp2,tmp1).  Same two cases as
   above, with tmp1 as the newer word. */
254nullfound2:
255	uxor,nbz	tmp3,r0,save
256	b,n		nullfound
257	stwm		tmp3,4(d_addr)
258	b		nullfound
259	/* notice that delay slot is in next routine */
260
261first_null0:	/* null found in first word of non-aligned (wrt d_addr) */
/* Byte-wise tail used when the terminator lies within the first
   destination word's worth of data.  The vshd below does double duty: it
   is the first instruction of first_null0 and also the delay slot of the
   final "b nullfound" in nullfound2, so it must stay the first
   instruction after this label.  It positions the remaining bytes of tmp1
   in save with zeros shifted in behind them.
   tmp6 is the destination byte offset (0..3) and selects how many bytes
   of save are live: offset 0 -> check4, 1 -> check3, 2 -> check2,
   3 -> only the last byte, which must then be the terminator. */
262	vshd		tmp1,r0,save	    /* delay slot */
263	combt,=		tmp6,r0,check4
264	extru		save,7,8,tmp4
/* first_null is also entered directly from the pos/neg_aligned_copy paths
   with save already holding the positioned word and tmp6 in 1..3.  tmp6
   is counted down to find the offset; the extru in the delay slot
   pre-loads the byte that check3 will test. */
265first_null:
266	addibt,=	-1,tmp6,check3	/* check last 3 bytes of word */
267	extru   	save,15,8,tmp4
268	addibt,=,n	-1,tmp6,check2	/* check last 2 bytes */
269	bv		0(rp)		/* null in last byte--store and exit */
270	stbys,b		save, 0(d_addr)
271
/* check4 / check3 / check2: test and store one byte at a time.  tmp4 holds
   the byte under test on entry to check4 and check3.  In each combt/stbs
   pair the store is the delay slot, so the byte is written whether or not
   the branch to done is taken, and the terminator itself is stored before
   leaving. */
272check4:
273	combt,=		tmp4,r0,done
274	stbs,ma		tmp4,1(d_addr)
275	extru,<>	save,15,8,tmp4
276check3:
277	combt,=		tmp4,r0,done
278	stbs,ma		tmp4,1(d_addr)
279check2:
/* Third byte: if it is zero, return and store it in the delay slot.
   Otherwise the bv is nullified, the byte is stored, and since a
   terminator is known to be in this word the last byte must be it, so a
   zero byte is stored explicitly from r0 on the way out. */
280	extru,<>	save,23,8,tmp4
281	bv		0(rp)
282	stbs,ma		tmp4,1(d_addr)
283	bv		0(rp)
284	stbs		r0,0(d_addr)
285
/* done falls straight into EXIT(strcpy), a macro defined outside this
   file; presumably it provides the function return -- confirm in DEFS.h. */
286done:
287EXIT(strcpy)
288