1/* ANSI C standard library function strcmp.
2
   Copyright (c) 2001-2012 Tensilica Inc.
4
5   Permission is hereby granted, free of charge, to any person obtaining
6   a copy of this software and associated documentation files (the
7   "Software"), to deal in the Software without restriction, including
8   without limitation the rights to use, copy, modify, merge, publish,
9   distribute, sublicense, and/or sell copies of the Software, and to
10   permit persons to whom the Software is furnished to do so, subject to
11   the following conditions:
12
13   The above copyright notice and this permission notice shall be included
14   in all copies or substantial portions of the Software.
15
16   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19   IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20   CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22   SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
23
24#include "xtensa-asm.h"
25
26#define MASK4 0x40404040
27
28
29#if XCHAL_HAVE_L32R
30	.literal .Lmask0, MASK0
31	.literal .Lmask1, MASK1
32	.literal .Lmask2, MASK2
33	.literal .Lmask3, MASK3
34	.literal .Lmask4, MASK4
35#endif /* XCHAL_HAVE_L32R */
36
37	.text
38	.align	4
39	.literal_position
40	.global	strcmp
41	.type	strcmp, @function
42strcmp:
43
44	leaf_entry sp, 16
45	/* a2 = s1, a3 = s2 */
46
47	l8ui	a8, a2, 0	// byte 0 from s1
48	l8ui	a9, a3, 0	// byte 0 from s2
49	movi	a10, 3		// mask
50	bne	a8, a9, .Lretdiff
51
52	or	a11, a2, a3
53	bnone	a11, a10, .Laligned
54
55	xor	a11, a2, a3	// compare low two bits of s1 and s2
56	bany	a11, a10, .Lunaligned	// if they have different alignment
57
58	/* s1/s2 are not word-aligned.  */
59	addi	a2, a2, 1	// advance s1
60	beqz	a8, .Leq	// bytes equal, if zero, strings are equal
61	addi	a3, a3, 1	// advance s2
62	bnone	a2, a10, .Laligned // if s1/s2 now aligned
63	l8ui	a8, a2, 0	// byte 1 from s1
64	l8ui	a9, a3, 0	// byte 1 from s2
65	addi	a2, a2, 1	// advance s1
66	bne	a8, a9, .Lretdiff // if different, return difference
67	beqz	a8, .Leq	// bytes equal, if zero, strings are equal
68	addi	a3, a3, 1	// advance s2
69	bnone	a2, a10, .Laligned // if s1/s2 now aligned
70	l8ui	a8, a2, 0	// byte 2 from s1
71	l8ui	a9, a3, 0	// byte 2 from s2
72	addi	a2, a2, 1	// advance s1
73	bne	a8, a9, .Lretdiff // if different, return difference
74	beqz	a8, .Leq	// bytes equal, if zero, strings are equal
75	addi	a3, a3, 1	// advance s2
76	j	.Laligned
77
78/* s1 and s2 have different alignment.
79
80   If the zero-overhead loop option is available, use an (almost)
81   infinite zero-overhead loop with conditional exits so we only pay
82   for taken branches when exiting the loop.
83
84   Note: It is important for this unaligned case to come before the
85   code for aligned strings, because otherwise some of the branches
86   above cannot reach and have to be transformed to branches around
87   jumps.  The unaligned code is smaller and the branches can reach
88   over it.  */
89
90	.align	4
91#if XCHAL_HAVE_LOOPS
92#if XCHAL_HAVE_DENSITY
93	/* (2 mod 4) alignment for loop instruction */
94#else
95	/* (1 mod 4) alignment for loop instruction */
96	.byte	0
97	.byte	0
98#endif
99#endif
100.Lunaligned:
101#if XCHAL_HAVE_LOOPS
102#if XCHAL_HAVE_DENSITY
103	_movi.n	a8, 0		// set up for the maximum loop count
104#else
105	_movi	a8, 0		// set up for the maximum loop count
106#endif
107	loop	a8, .Lretdiff	// loop forever (almost anyway)
108#endif
109.Lnextbyte:
110	l8ui	a8, a2, 0
111	l8ui	a9, a3, 0
112	addi	a2, a2, 1
113	bne	a8, a9, .Lretdiff
114	addi	a3, a3, 1
115#if XCHAL_HAVE_LOOPS
116	beqz	a8, .Lretdiff
117#else
118	bnez	a8, .Lnextbyte
119#endif
120.Lretdiff:
121	sub	a2, a8, a9
122	leaf_return
123
124/* s1 is word-aligned; s2 is word-aligned.
125
126   If the zero-overhead loop option is available, use an (almost)
127   infinite zero-overhead loop with conditional exits so we only pay
128   for taken branches when exiting the loop.  */
129
130/* New algorithm, relying on the fact that all normal ASCII is between
131   32 and 127.
132
133   Rather than check all bytes for zero:
134   Take one word (4 bytes).  Call it w1.
135   Shift w1 left by one into w1'.
136   Or w1 and w1'.  For all normal ASCII bit 6 will be 1; for zero it won't.
137   Check that all 4 bit 6's (one for each byte) are one:
138   If they are, we are definitely not done.
139   If they are not, we are probably done, but need to check for zero.  */
140
141	.align	4
142#if XCHAL_HAVE_LOOPS
143#if !XCHAL_HAVE_L32R
144	/* (2 mod 4) alignment for loop instruction */
145	.byte	0
146	.byte	0
147#endif
148.Laligned:
149#if XCHAL_HAVE_L32R
150	l32r	a4, .Lmask0	// mask for byte 0
151	l32r	a7, .Lmask4
152#else
153	const16	a4, MASK0@h
154	const16	a4, MASK0@l
155	const16	a7, MASK4@h
156	const16	a7, MASK4@l
157#endif
158	/* Loop forever */
1591:
160	loop	a0, .Laligned_done
161
162	/* First unrolled loop body.  */
163	l32i	a8, a2, 0	// get word from s1
164	l32i	a9, a3, 0	// get word from s2
165	slli	a5, a8, 1
166	bne	a8, a9, .Lwne2
167	or	a9, a8, a5
168	bnall	a9, a7, .Lprobeq
169
170	/* Second unrolled loop body.  */
171	l32i	a8, a2, 4	// get word from s1+4
172	l32i	a9, a3, 4	// get word from s2+4
173	slli	a5, a8, 1
174	bne	a8, a9, .Lwne2
175	or	a9, a8, a5
176	bnall	a9, a7, .Lprobeq2
177
178	addi	a2, a2, 8	// advance s1 pointer
179	addi	a3, a3, 8	// advance s2 pointer
180.Laligned_done:
181	j     	1b
182
183.Lprobeq2:
184	/* Adjust pointers to account for the loop unrolling.  */
185	addi	a2, a2, 4
186	addi	a3, a3, 4
187
188#else /* !XCHAL_HAVE_LOOPS */
189
190.Laligned:
191	movi	a4, MASK0	// mask for byte 0
192	movi	a7, MASK4
193	j	.Lfirstword
194.Lnextword:
195	addi	a2, a2, 4	// advance s1 pointer
196	addi	a3, a3, 4	// advance s2 pointer
197.Lfirstword:
198	l32i	a8, a2, 0	// get word from s1
199	l32i	a9, a3, 0	// get word from s2
200	slli	a5, a8, 1
201	bne	a8, a9, .Lwne2
202	or	a9, a8, a5
203	ball	a9, a7, .Lnextword
204#endif /* !XCHAL_HAVE_LOOPS */
205
206	/* align (0 mod 4) */
207.Lprobeq:
208	/* Words are probably equal, but check for sure.
209	   If not, loop over the rest of string using normal algorithm.  */
210
211	bnone	a8, a4, .Leq	// if byte 0 is zero
212#if XCHAL_HAVE_L32R
213	l32r	a5, .Lmask1	// mask for byte 1
214	l32r	a6, .Lmask2	// mask for byte 2
215	bnone	a8, a5, .Leq	// if byte 1 is zero
216	l32r	a7, .Lmask3	// mask for byte 3
217	bnone	a8, a6, .Leq	// if byte 2 is zero
218	bnone	a8, a7, .Leq	// if byte 3 is zero
219	/* align (1 mod 4) */
220#else
221	const16	a5, MASK1@h	// mask for byte 1
222	const16	a5, MASK1@l
223	bnone	a8, a5, .Leq	// if byte 1 is zero
224	const16	a6, MASK2@h	// mask for byte 2
225	const16	a6, MASK2@l
226	bnone	a8, a6, .Leq	// if byte 2 is zero
227	const16	a7, MASK3@h	// mask for byte 3
228	const16	a7, MASK3@l
229	bnone	a8, a7, .Leq	// if byte 3 is zero
230	/* align (2 mod 4) */
231#endif /* XCHAL_HAVE_L32R */
232#if XCHAL_HAVE_DENSITY
233	addi.n	a2, a2, 4	// advance s1 pointer
234	addi.n	a3, a3, 4	// advance s2 pointer
235	/* align (1 mod 4) or (2 mod 4) */
236#else
237	addi	a2, a2, 4	// advance s1 pointer
238	addi	a3, a3, 4	// advance s2 pointer
239	or	a1, a1, a1	// nop
240#if !XCHAL_HAVE_L32R
241	or	a1, a1, a1	// nop
242#endif
243	/* align (2 mod 4) */
244#endif /* XCHAL_HAVE_DENSITY */
245#if XCHAL_HAVE_LOOPS
2461:
247	loop	a0, .Leq	// loop forever (a4 is bigger than max iters)
248	l32i	a8, a2, 0	// get word from s1
249	l32i	a9, a3, 0	// get word from s2
250	addi	a2, a2, 4	// advance s1 pointer
251	bne	a8, a9, .Lwne
252	bnone	a8, a4, .Leq	// if byte 0 is zero
253	bnone	a8, a5, .Leq	// if byte 1 is zero
254	bnone	a8, a6, .Leq	// if byte 2 is zero
255	bnone	a8, a7, .Leq	// if byte 3 is zero
256	addi	a3, a3, 4	// advance s2 pointer
257	j	1b
258#else /* !XCHAL_HAVE_LOOPS */
259
260	j	.Lfirstword2
261.Lnextword2:
262	addi	a3, a3, 4	// advance s2 pointer
263.Lfirstword2:
264	l32i	a8, a2, 0	// get word from s1
265	l32i	a9, a3, 0	// get word from s2
266	addi	a2, a2, 4	// advance s1 pointer
267	bne	a8, a9, .Lwne
268	bnone	a8, a4, .Leq	// if byte 0 is zero
269	bnone	a8, a5, .Leq	// if byte 1 is zero
270	bnone	a8, a6, .Leq	// if byte 2 is zero
271	bany	a8, a7, .Lnextword2	// if byte 3 is zero
272#endif /* !XCHAL_HAVE_LOOPS */
273
274	/* Words are equal; some byte is zero.  */
275.Leq:	movi	a2, 0		// return equal
276	leaf_return
277
278.Lwne2:	/* Words are not equal.  On big-endian processors, if none of the
279	   bytes are zero, the return value can be determined by a simple
280	   comparison.  */
281#ifdef __XTENSA_EB__
282	or	a10, a8, a5
283	bnall	a10, a7, .Lsomezero
284	bgeu	a8, a9, .Lposreturn
285	movi	a2, -1
286	leaf_return
287.Lposreturn:
288	movi	a2, 1
289	leaf_return
290.Lsomezero:	// There is probably some zero byte.
291#endif /* __XTENSA_EB__ */
292.Lwne:	/* Words are not equal.  */
293	xor	a2, a8, a9	// get word with nonzero in byte that differs
294	bany	a2, a4, .Ldiff0	// if byte 0 differs
295	movi	a5, MASK1	// mask for byte 1
296	bnone	a8, a4, .Leq	// if byte 0 is zero
297	bany	a2, a5, .Ldiff1	// if byte 1 differs
298	movi	a6, MASK2	// mask for byte 2
299	bnone	a8, a5, .Leq	// if byte 1 is zero
300	bany	a2, a6, .Ldiff2	// if byte 2 differs
301	bnone	a8, a6, .Leq	// if byte 2 is zero
302#ifdef __XTENSA_EB__
303.Ldiff3:
304.Ldiff2:
305.Ldiff1:
306	/* Byte 0 is equal (at least) and there is a difference before a zero
307	   byte.  Just subtract words to get the return value.
308	   The high order equal bytes cancel, leaving room for the sign.  */
309	sub	a2, a8, a9
310	leaf_return
311
312.Ldiff0:
313	/* Need to make room for the sign, so can't subtract whole words.  */
314	extui	a10, a8, 24, 8
315	extui	a11, a9, 24, 8
316	sub	a2, a10, a11
317	leaf_return
318
319#else /* !__XTENSA_EB__ */
320	/* Little-endian is a little more difficult because can't subtract
321	   whole words.  */
322.Ldiff3:
323	/* Bytes 0-2 are equal; byte 3 is different.
324	   For little-endian need to have a sign bit for the difference.  */
325	extui	a10, a8, 24, 8
326	extui	a11, a9, 24, 8
327	sub	a2, a10, a11
328	leaf_return
329
330.Ldiff0:
331	/* Byte 0 is different.  */
332	extui	a10, a8, 0, 8
333	extui	a11, a9, 0, 8
334	sub	a2, a10, a11
335	leaf_return
336
337.Ldiff1:
338	/* Byte 0 is equal; byte 1 is different.  */
339	extui	a10, a8, 8, 8
340	extui	a11, a9, 8, 8
341	sub	a2, a10, a11
342	leaf_return
343
344.Ldiff2:
345	/* Bytes 0-1 are equal; byte 2 is different.  */
346	extui	a10, a8, 16, 8
347	extui	a11, a9, 16, 8
348	sub	a2, a10, a11
349	leaf_return
350
351#endif /* !__XTENSA_EB */
352
353	.size	strcmp, . - strcmp
354