1/*
2 *  (c) Copyright 1986 HEWLETT-PACKARD COMPANY
3 *
4 *  To anyone who acknowledges that this file is provided "AS IS"
5 *  without any express or implied warranty:
6 *      permission to use, copy, modify, and distribute this file
7 *  for any purpose is hereby granted without fee, provided that
8 *  the above copyright notice and this notice appears in all
9 *  copies, and that the name of Hewlett-Packard Company not be
10 *  used in advertising or publicity pertaining to distribution
11 *  of the software without specific, written prior permission.
12 *  Hewlett-Packard Company makes no representations about the
13 *  suitability of this software for any purpose.
14 */
15
16/*
17	A faster strcpy.
18
19	by
20
21	Jerry Huck (aligned case)
22	Daryl Odnert (equal-alignment case)
23	Edgar Circenis (non-aligned case)
24*/
25/*
26 * strcpy(s1, s2)
27 *
28 * Copy string s2 to s1.  s1 must be large enough.
29 * return s1
30 */
31
32#include "DEFS.h"
33
/*
 * Register aliases used by strcpy.
 *
 *   d_addr   destination pointer; its entry value is copied to ret0
 *   s_addr   source pointer
 *   tmp6     index into the case_analysis branch table on entry; the
 *            unaligned paths reload it with the low two bits of d_addr
 *   tmp1 / evenside   two names for r19
 *   tmp2 / oddside    two names for r20
 *            (evenside/oddside are the names used by the word-aligned
 *            copy loop; tmp1/tmp2 name the same registers in the byte
 *            and unaligned paths)
 *   tmp3     shift quantity (TGT-SRC) on entry to the unaligned paths,
 *            then the realigned word about to be stored
 *   tmp4     shift amount in bits (tmp3 * 8); also a scratch byte in the
 *            check4/check3/check2 tail
 *   tmp5     not referenced by the code in this routine
 *   save     copy of the word currently being tested for a null byte
 */
34#define	d_addr		r26
35#define	s_addr		r25
36#define	tmp6		r24
37#define	tmp1		r19
38#define evenside	r19
39#define	tmp2		r20
40#define oddside		r20
41#define	tmp3		r21
42#define	tmp4		r22
43#define	tmp5		arg3
44#define	save		r1
45
46
/*
 * strcpy entry: classify the alignment of both pointers.
 *
 * tmp6 is built as ((d_addr & 3) << 2) | (s_addr & 3).  When it is zero
 * both pointers are word aligned, the branch to case_analysis is
 * nullified, and execution falls into bothaligned with the first source
 * word already in oddside and s_addr advanced by 4.  Otherwise
 * case_analysis dispatches on tmp6.  ret0 receives the original
 * destination pointer on either path.
 */
47ENTRY(strcpy)
48/* Do some quick alignment checking and take the fast path when both are word aligned */
49        extru,<>   s_addr,31,2,tmp6    /*Is source word aligned? */
/* The load below is nullified when the source is not word aligned. */
50        ldwm       4(0,s_addr),oddside /*Assume yes and guess that it
51                                          is double-word aligned. */
52        dep,=      d_addr,29,2,tmp6    /*Is target word aligned? */
53        b          case_analysis       /* nullified when tmp6 == 0 */
54	copy       d_addr,ret0         /* return value; delay slot or fall-through */
55/* Both are aligned.  First source word already loaded assuming that
56   source was oddword aligned.  Fall through (therefore fastest) code
57   shuffles the registers to join the main loop */
58bothaligned:
/* Every path into bothaligned has already post-incremented s_addr past
   the first word, so bit 29 clear here means that word came from an odd
   word address.  The uxor in the delay slot copies the first word into
   save and nullifies the next executed instruction (the "b,n nullfound"
   on either path) unless the word contains a zero byte. */
59	bb,>=    s_addr,29,twoatatime  /*Branch if source was odd aligned*/
60	uxor,nbz oddside,r0,save
61
62/* Even aligned source.  save holds that operand.
63   Do one iteration of the main copy loop juggling the registers to avoid
64   one copy. */
65	b,n	 nullfound             /* first word holds the terminator */
66	ldwm     4(s_addr),oddside     /* second word */
67	stwm     save,4(d_addr)        /* store first word */
68	uxor,nbz oddside,r0,save       /* zero byte in second word? */
69	b,n      nullfound
70        ldwm     4(s_addr),evenside    /* third word */
71        stwm     oddside,4(d_addr)     /* store second word */
72        uxor,nbz evenside,r0,save      /* zero byte in third word? */
73        b,n      nullfound
74        ldwm     4(s_addr),oddside     /* fourth word; loop stores the third */
75
76/* Main loop body.  Entry expects evenside still to be stored, oddside
77   just loaded. */
78loop:
79        stwm     evenside,4(d_addr)
80        uxor,nbz oddside,r0,save       /* save = oddside; skip next unless it has a zero byte */
81
82/* mid loop entry */
83twoatatime:
84        b,n      nullfound             /* oddside (now in save) holds the terminator */
85        ldwm     4(s_addr),evenside
86        stwm     oddside,4(d_addr)
87        uxor,sbz evenside,r0,save      /* nullify the loop branch if evenside has a zero byte */
88        b        loop
/* Delay slot of the loop branch.  When the branch is nullified this load
   still executes on the way into nullfound, so the word following the
   terminator word is read (but never stored). */
89        ldwm     4(s_addr),oddside
90
/*
 * nullfound: save holds the word that contains the terminating null and
 * d_addr points at where that word belongs.  Find the first zero byte,
 * store the bytes up to and including it, and return.
 *
 * Each extru,<> nullifies the following addib when the extracted byte is
 * nonzero.  When the byte is zero the addib bumps d_addr by the number of
 * bytes to keep and branches to the partial-word store.
 */
91/* fall through when null found in evenside.  oddside actually loaded */
92nullfound:				/* adjust d_addr and store final word */
93
94	extru,<>	save,7,8,r0         /* pick up leftmost byte */
95	addib,tr,n	1,d_addr,store_final   /* byte 0 is the null: keep 1 byte */
96	extru,<>	save,15,8,r0
97	addib,tr,n	2,d_addr,store_final   /* byte 1 is the null: keep 2 bytes */
98	extru,<> 	save,23,8,r0
/* No ,n on this addib: when it is taken, the bv on the next line runs in
   its delay slot, so the stbys,e at store_final2 executes and control
   then returns through rp.
   NOTE(review): relies on PA-RISC branch-in-delay-slot behavior. */
99	addib,tr	3,d_addr,store_final2  /* byte 2 is the null: keep 3 bytes */
/* Reached as ordinary code when bytes 0-2 are all nonzero: the null is
   the last byte, so store the whole word and return. */
100	bv		0(rp)
101	stw		save,0(d_addr)
102
103store_final:
104	bv		0(rp)
105store_final2:
106	stbys,e		save,0(d_addr) 	/* delay slot */
107
/*
 * case_analysis: dispatch on pointer alignment.
 *
 * tmp6 = ((d_addr & 3) << 2) | (s_addr & 3), built at entry.  blr uses it
 * to index the table that starts right after blr's delay slot; every
 * entry is exactly two instructions (a branch and its delay slot), so
 * entries must not be lengthened or reordered.  Only the last entry
 * (11 11) is longer, and it is safe because nothing follows it in the
 * table.
 *
 * Entries whose source offset is 00 branch to pos_aligned_copy0 rather
 * than pos_aligned_copy: in that case the ldwm at entry was not nullified
 * and has already advanced s_addr, which pos_aligned_copy0 undoes.
 */
108case_analysis:
109
110        blr         tmp6,r0            /* jump to table entry tmp6; no link saved */
111        nop
112
113	/* NOTE: the delay slots for the non-aligned cases load a   */
114	/* shift quantity which is TGT-SRC into tmp3.               */
115        /* Note also, the case for both strings being word aligned  */
116	/* is already checked before the BLR is executed, so that   */
117	/* case can never occur.                                    */
118
119                                       /* TGT SRC */
120        nop                            /* 00  00  can't happen */
121        nop
122        b           neg_aligned_copy   /* 00  01  */
123	ldi         -1,tmp3            /* load shift quantity. delay slot */
124        b           neg_aligned_copy   /* 00  10  */
125	ldi         -2,tmp3            /* load shift quantity. delay slot */
126        b           neg_aligned_copy   /* 00  11  */
127	ldi         -3,tmp3            /* load shift quantity. delay slot */
128        b           pos_aligned_copy0  /* 01  00  */
129	ldi         1,tmp3            /* load shift quantity. delay slot */
130        b           equal_alignment_1  /* 01  01  */
131        ldbs,ma     1(s_addr),tmp1     /* delay slot: fetch first byte, advance s_addr */
132        b           neg_aligned_copy   /* 01  10  */
133	ldi         -1,tmp3            /* load shift quantity. delay slot */
134        b           neg_aligned_copy   /* 01  11  */
135	ldi         -2,tmp3            /* load shift quantity. delay slot */
136        b           pos_aligned_copy0  /* 10  00  */
137	ldi         2,tmp3            /* load shift quantity. delay slot */
138        b           pos_aligned_copy   /* 10  01  */
139	ldi         1,tmp3            /* load shift quantity. delay slot */
140        b           equal_alignment_2  /* 10  10  */
141        ldhs,ma     2(s_addr),tmp1     /* delay slot: fetch first halfword, advance s_addr */
142        b           neg_aligned_copy   /* 10  11  */
143	ldi         -1,tmp3            /* load shift quantity. delay slot */
144        b           pos_aligned_copy0  /* 11  00  */
145	ldi         3,tmp3            /* load shift quantity. delay slot */
146        b           pos_aligned_copy   /* 11  01  */
147	ldi         2,tmp3            /* load shift quantity. delay slot */
148        b           pos_aligned_copy   /* 11  10  */
149	ldi         1,tmp3            /* load shift quantity. delay slot */
/* Last entry, handled inline: both pointers are one byte short of a word
   boundary.  Copy that single byte; if it was not the terminator both
   pointers are now word aligned and the aligned path takes over. */
150        ldbs,ma     1(s_addr),tmp1     /* 11  11  */
151        comiclr,<>  r0,tmp1,r0         /* nullify the return if tmp1 <> 0 */
152        bv          0(rp)              /* return if 1st byte was null */
153        stbs,ma     tmp1,1(d_addr)     /* store a byte to dst string  */
154        b           bothaligned       /* can now goto word_aligned   */
155        ldwm        4(s_addr),oddside     /* load next word of source    */
156
/*
 * Source and destination share the same non-zero offset within a word.
 * Copy leading bytes one at a time until both pointers reach a word
 * boundary, returning early if the terminator is among them, then join
 * the aligned copy at bothaligned.
 *
 * equal_alignment_1: offset 1; tmp1 holds the first byte (loaded in the
 *                    branch-table delay slot).  Falls into
 *                    equal_alignment_2 after copying it.
 * equal_alignment_2: offset 2; tmp1 holds the next halfword.
 *
 * In each test the stbs,ma sits in the delay slot of the bv, so the byte
 * is stored whether or not the return is taken; a null byte is therefore
 * written before returning.
 */
157equal_alignment_1:
158        comiclr,<>  r0,tmp1,r0      /* nullify next if tmp1 <> 0  */
159        bv          0(rp)           /* return if null byte found  */
160        stbs,ma     tmp1,1(d_addr)  /* store a byte to dst string */
161        ldhs,ma     2(s_addr),tmp1  /* load next halfword         */
162equal_alignment_2:
163        extru,<>    tmp1,23,8,tmp6  /* look at left byte of halfword */
164        bv          0(rp)           /* return if 1st byte was null */
165        stbs,ma     tmp6,1(d_addr)  /* store left byte */
166        extru,<>    tmp1,31,8,r0    /* look at right byte of halfword */
167        bv          0(rp)           /* return if 2nd byte was null */
168        stbs,ma     tmp1,1(d_addr)  /* store right byte (low 8 bits of tmp1) */
169        b           bothaligned
170        ldwm        4(s_addr),oddside  /* load next word              */
171
172/* source and destination are not aligned, so we do it the hard way. */
173
174/* target alignment is greater than source alignment */
/*
 * On entry tmp3 holds TGT-SRC (1..3), loaded in the branch-table delay
 * slot.  The code reads whole source words from the rounded-down source
 * address and uses vshd with the shift-amount register (cr11) to line the
 * bytes up with the destination.
 *
 * pos_aligned_copy0 is the entry used when the source was word aligned:
 * the ldwm at strcpy entry has already advanced s_addr by 4, so that
 * increment is undone first.
 *
 * Exits: first_null   - null found in the first realigned word (save)
 *        nullfound1   - null found in the first raw source word (tmp2)
 *        loop_entry   - no null yet; tmp3 is ready to store and tmp1
 *                       holds the next source word
 */
175pos_aligned_copy0:
176	addi		-4,s_addr,s_addr
177pos_aligned_copy:
178        extru       d_addr,31,2,tmp6   /* Extract low 2 bits of the dest addr */
179        extru       s_addr,31,2,tmp1   /* Extract low 2 bits of the src addr */
180        dep         r0,31,2,s_addr     /* Compute word address of the source. */
181        sh3add		tmp3,r0,tmp4        /* compute shift amt */
182        ldwm        	4(0,s_addr),tmp2    /* get 1st source word */
183	sh3add		tmp1,r0,save  	    /* setup mask shift amount */
184	mtctl		save,r11	    /* set-up cr11 for mask */
185	zvdepi		-2,32,save	    /* create mask */
186	or		save,tmp2,tmp2	    /* mask unused bytes in src */
187	ldi		-1,tmp1		    /* load tmp1 with 0xffffffff */
188        mtctl        	tmp4,r11            /* shift count -> shift count reg */
189        vshd        	tmp1,tmp2,tmp3      /* position data ! */
190	uxor,nbz	tmp3,r0,save	    /* null in the positioned word? */
191	b,n		first_null
192	uxor,nbz	tmp2,r0,save	    /* null anywhere in the raw source word? */
193	b		nullfound1
/* The mtctl is the delay slot of "b nullfound1"; when that branch is
   nullified it runs as an ordinary instruction before "b loop_entry". */
194        mtctl        	tmp4,r11            /* re-load shift cnt (delay slot) */
195	b		loop_entry
196        ldwm        	4(0,s_addr),tmp1    /* get next word. delay slot */
197
/*
 * neg_aligned_copy: source offset within its word is greater than the
 * destination offset.  On entry tmp3 holds TGT-SRC (-1..-3), loaded in
 * the branch-table delay slot.
 *
 * The first source word is checked for a null before the second word is
 * read.  Two source words are then combined with vshd to form the first
 * destination word.
 *
 * Exits: first_null0  - null in the first source word
 *        chunk1       - destination is word aligned (tmp6 == 0); no
 *                       masking of the positioned word is needed
 *        first_null   - null in the first positioned word (save)
 *        nullfound1   - null in the second source word (tmp2)
 *        loop_entry   - no null yet; tmp3 ready to store, tmp1 holds the
 *                       next source word
 */
198neg_aligned_copy:
199        extru       d_addr,31,2,tmp6   /* Extract low 2 bits of the dest addr */
200	extru	    s_addr,31,2,tmp2   /* Extract low 2 bits of the src addr */
201        dep         r0,31,2,s_addr     /* Compute word address of the source. */
202        sh3add		tmp3,r0,tmp4        /* compute shift amt */
203        ldwm         	4(0,s_addr),tmp1    /* load first word from source. */
204/* check to see if next word can be read safely */
205	sh3add		tmp2,r0,save	    /* source byte offset * 8 */
206        mtctl        	save,r11            /* shift count -> shift count reg */
207	zvdepi		-2,32,save	    /* create mask */
208	or		save, tmp1, tmp1    /* mask unused bytes in first word */
209	uxor,nbz	tmp1,r0,save	    /* any nulls in first word? */
210	b		first_null0
/* Delay slot of "b first_null0"; also runs as an ordinary instruction
   when that branch is nullified.  Either way cr11 = tmp4 afterwards. */
211	mtctl		tmp4,r11
212        ldwm        	4(0,s_addr),tmp2    /* load second word from source */
213	combt,=		tmp6,r0,chunk1      /* don't mask if whole word valid */
214        vshd        	tmp1,tmp2,tmp3      /* position data ! */
215	sh3add		tmp6,r0,save  	    /* setup r1 */
216	mtctl		save,r11	    /* set-up cr11 for mask */
217	zvdepi		-2,32,save	    /* create mask from dest offset */
218	or		save, tmp3, tmp3    /* mask unused bytes in positioned word */
219	uxor,nbz	tmp3,r0,save	    /* null in the positioned word? */
220	b,n		first_null
221	uxor,nbz	tmp2,r0,save	    /* null in the second source word? */
222	b		nullfound1
223        mtctl        	tmp4,r11            /* re-load shift cnt (delay slot) */
224	b		loop_entry
225        ldwm        	4(0,s_addr),tmp1    /* get next word. delay slot */
226
/*
 * Main loop for the unaligned case, plus its null-found exits.
 *
 * The loop alternates tmp1 and tmp2 as "newest source word"; each vshd
 * combines the previous and newest source words into tmp3, the next
 * destination word, using the shift count held in cr11.
 *
 * chunk1      - entry from neg_aligned_copy when the destination is word
 *               aligned; tmp1/tmp2 hold the first two source words.
 * did_mask    - loop top: fetch the next source word into tmp1.
 * loop_entry  - store tmp3 (stbys,b so that a first, partial destination
 *               word is handled) and continue.
 * nullfound0  - null seen in tmp2: if it already falls inside tmp3, let
 *               nullfound finish; otherwise drop into nullfound1.
 * nullfound1  - store tmp3, then hand the leftover bytes of tmp2 to
 *               nullfound in save.
 * nullfound2  - same as nullfound0/1 but for a null seen in tmp1.
 */
227chunk1:
228	uxor,nbz	tmp2,r0,save	    /* null in second source word? */
229	b		nullfound0
230	vshd		tmp1,tmp2,tmp3	    /* delay slot / fall-through: position data */
231did_mask:
232        ldwm        	4(0,s_addr),tmp1    /* get next word !  */
233loop_entry:
234        stbys,b,m   	tmp3,4(0,d_addr)    /* store !  */
235
236	uxor,nbz	tmp1, r0, save	    /* null in newest source word? */
237	b		nullfound2
238        vshd        	tmp2,tmp1,tmp3      /* position data !  */
239	ldwm		4(s_addr),tmp2
240	stwm		tmp3,4(d_addr)
241	uxor,sbz	tmp2,r0,save	    /* nullify loop branch if tmp2 has a null */
242	b		did_mask
/* The vshd below is both the delay slot of "b did_mask" and the first
   instruction of nullfound0; it must stay immediately after the branch. */
243nullfound0:
244	vshd		tmp1,tmp2,tmp3	    /* delay slot */
245	uxor,nbz	tmp3,r0,save	    /* null already inside the positioned word? */
246	b,n		nullfound
247nullfound1:
248	stbys,b,m	tmp3,4(0,d_addr)
249	b		nullfound
250	vshd		tmp2,r0,save	    /* delay slot */
251
252nullfound2:
253	uxor,nbz	tmp3,r0,save	    /* null already inside the positioned word? */
254	b,n		nullfound
255	stwm		tmp3,4(d_addr)
256	b		nullfound
257	/* notice that delay slot is in next routine */
258
/*
 * Tail for a null found in the very first destination word of an
 * unaligned copy.  save holds the positioned bytes and tmp6 the low two
 * bits of the destination address; the remaining bytes are stored one at
 * a time up to and including the null.
 *
 * The first instruction of first_null0 also serves as the delay slot of
 * the "b nullfound" that ends nullfound2, so it must remain the first
 * instruction after that branch.
 *
 * check4 / check3 / check2 examine the bytes extracted from save at bit
 * positions 7, 15 and 23 respectively.  Each stbs,ma sits in a branch
 * delay slot (or directly after a nullified branch), so the byte is
 * stored on both outcomes of the test.
 */
259first_null0:	/* null found in first word of non-aligned (wrt d_addr) */
260	vshd		tmp1,r0,save	    /* delay slot */
261	combt,=		tmp6,r0,check4	    /* dest offset 0: check all four bytes */
262	extru		save,7,8,tmp4	    /* delay slot: tmp4 = leftmost byte */
263first_null:
264	addibt,=	-1,tmp6,check3	/* check last 3 bytes of word */
265	extru   	save,15,8,tmp4	/* delay slot: tmp4 = second byte */
266	addibt,=,n	-1,tmp6,check2	/* check last 2 bytes */
267	bv		0(rp)		/* null in last byte--store and exit */
268	stbys,b		save, 0(d_addr)
269
270check4:
271	combt,=		tmp4,r0,done	/* first byte was the null */
272	stbs,ma		tmp4,1(d_addr)
/* When the second byte is nonzero this extru nullifies the combt at
   check3, and the stbs,ma after it then runs as ordinary code. */
273	extru,<>	save,15,8,tmp4
274check3:
275	combt,=		tmp4,r0,done	/* second byte was the null */
276	stbs,ma		tmp4,1(d_addr)
277check2:
278	extru,<>	save,23,8,tmp4	/* third byte; nullify return if nonzero */
279	bv		0(rp)		/* third byte was the null */
280	stbs,ma		tmp4,1(d_addr)
281	bv		0(rp)		/* otherwise the fourth byte is the null */
282	stbs		r0,0(d_addr)	/* delay slot: write the terminator */
283
/* NOTE(review): done relies on EXIT(strcpy) (from DEFS.h, not visible
   here) to emit the return sequence -- confirm. */
284done:
285EXIT(strcpy)
286