1/* ANSI C standard library function strcmp.
2
3   Copyright (c) 2001-20012 Tensilica Inc.
4
5   Permission is hereby granted, free of charge, to any person obtaining
6   a copy of this software and associated documentation files (the
7   "Software"), to deal in the Software without restriction, including
8   without limitation the rights to use, copy, modify, merge, publish,
9   distribute, sublicense, and/or sell copies of the Software, and to
10   permit persons to whom the Software is furnished to do so, subject to
11   the following conditions:
12
13   The above copyright notice and this permission notice shall be included
14   in all copies or substantial portions of the Software.
15
16   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19   IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20   CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22   SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
23
24#include <picolibc.h>
25
26#include "xtensa-asm.h"
27
28#define MASK4 0x40404040
29
30
31#if XCHAL_HAVE_L32R
32	.literal .Lmask0, MASK0
33	.literal .Lmask1, MASK1
34	.literal .Lmask2, MASK2
35	.literal .Lmask3, MASK3
36	.literal .Lmask4, MASK4
37#endif /* XCHAL_HAVE_L32R */
38
39	.text
40	.align	4
41	.literal_position
42	.global	strcmp
43	.type	strcmp, @function
44strcmp:
45
46	leaf_entry sp, 16
47	/* a2 = s1, a3 = s2 */
48
49	l8ui	a8, a2, 0	// byte 0 from s1
50	l8ui	a9, a3, 0	// byte 0 from s2
51	movi	a10, 3		// mask
52	bne	a8, a9, .Lretdiff
53
54	or	a11, a2, a3
55	bnone	a11, a10, .Laligned
56
57	xor	a11, a2, a3	// compare low two bits of s1 and s2
58	bany	a11, a10, .Lunaligned	// if they have different alignment
59
60	/* s1/s2 are not word-aligned.  */
61	addi	a2, a2, 1	// advance s1
62	beqz	a8, .Leq	// bytes equal, if zero, strings are equal
63	addi	a3, a3, 1	// advance s2
64	bnone	a2, a10, .Laligned // if s1/s2 now aligned
65	l8ui	a8, a2, 0	// byte 1 from s1
66	l8ui	a9, a3, 0	// byte 1 from s2
67	addi	a2, a2, 1	// advance s1
68	bne	a8, a9, .Lretdiff // if different, return difference
69	beqz	a8, .Leq	// bytes equal, if zero, strings are equal
70	addi	a3, a3, 1	// advance s2
71	bnone	a2, a10, .Laligned // if s1/s2 now aligned
72	l8ui	a8, a2, 0	// byte 2 from s1
73	l8ui	a9, a3, 0	// byte 2 from s2
74	addi	a2, a2, 1	// advance s1
75	bne	a8, a9, .Lretdiff // if different, return difference
76	beqz	a8, .Leq	// bytes equal, if zero, strings are equal
77	addi	a3, a3, 1	// advance s2
78	j	.Laligned
79
80/* s1 and s2 have different alignment.
81
82   If the zero-overhead loop option is available, use an (almost)
83   infinite zero-overhead loop with conditional exits so we only pay
84   for taken branches when exiting the loop.
85
86   Note: It is important for this unaligned case to come before the
87   code for aligned strings, because otherwise some of the branches
88   above cannot reach and have to be transformed to branches around
89   jumps.  The unaligned code is smaller and the branches can reach
90   over it.  */
91
92	.align	4
93#if XCHAL_HAVE_LOOPS
94#if XCHAL_HAVE_DENSITY
95	/* (2 mod 4) alignment for loop instruction */
96#else
97	/* (1 mod 4) alignment for loop instruction */
98	.byte	0
99	.byte	0
100#endif
101#endif
102.Lunaligned:
103#if XCHAL_HAVE_LOOPS
104#if XCHAL_HAVE_DENSITY
105	_movi.n	a8, 0		// set up for the maximum loop count
106#else
107	_movi	a8, 0		// set up for the maximum loop count
108#endif
109	loop	a8, .Lretdiff	// loop forever (almost anyway)
110#endif
111.Lnextbyte:
112	l8ui	a8, a2, 0
113	l8ui	a9, a3, 0
114	addi	a2, a2, 1
115	bne	a8, a9, .Lretdiff
116	addi	a3, a3, 1
117#if XCHAL_HAVE_LOOPS
118	beqz	a8, .Lretdiff
119#else
120	bnez	a8, .Lnextbyte
121#endif
122.Lretdiff:
123	sub	a2, a8, a9
124	leaf_return
125
126/* s1 is word-aligned; s2 is word-aligned.
127
128   If the zero-overhead loop option is available, use an (almost)
129   infinite zero-overhead loop with conditional exits so we only pay
130   for taken branches when exiting the loop.  */
131
132/* New algorithm, relying on the fact that all normal ASCII is between
133   32 and 127.
134
135   Rather than check all bytes for zero:
136   Take one word (4 bytes).  Call it w1.
137   Shift w1 left by one into w1'.
138   Or w1 and w1'.  For all normal ASCII bit 6 will be 1; for zero it won't.
139   Check that all 4 bit 6's (one for each byte) are one:
140   If they are, we are definitely not done.
141   If they are not, we are probably done, but need to check for zero.  */
142
143	.align	4
144#if XCHAL_HAVE_LOOPS
145#if !XCHAL_HAVE_L32R
146	/* (2 mod 4) alignment for loop instruction */
147	.byte	0
148	.byte	0
149#endif
150.Laligned:
151#if XCHAL_HAVE_L32R
152	l32r	a4, .Lmask0	// mask for byte 0
153	l32r	a7, .Lmask4
154#else
155	const16	a4, MASK0@h
156	const16	a4, MASK0@l
157	const16	a7, MASK4@h
158	const16	a7, MASK4@l
159#endif
160	/* Loop forever */
1611:
162	loop	a0, .Laligned_done
163
164	/* First unrolled loop body.  */
165	l32i	a8, a2, 0	// get word from s1
166	l32i	a9, a3, 0	// get word from s2
167	slli	a5, a8, 1
168	bne	a8, a9, .Lwne2
169	or	a9, a8, a5
170	bnall	a9, a7, .Lprobeq
171
172	/* Second unrolled loop body.  */
173	l32i	a8, a2, 4	// get word from s1+4
174	l32i	a9, a3, 4	// get word from s2+4
175	slli	a5, a8, 1
176	bne	a8, a9, .Lwne2
177	or	a9, a8, a5
178	bnall	a9, a7, .Lprobeq2
179
180	addi	a2, a2, 8	// advance s1 pointer
181	addi	a3, a3, 8	// advance s2 pointer
182.Laligned_done:
183	j     	1b
184
185.Lprobeq2:
186	/* Adjust pointers to account for the loop unrolling.  */
187	addi	a2, a2, 4
188	addi	a3, a3, 4
189
190#else /* !XCHAL_HAVE_LOOPS */
191
192.Laligned:
193	movi	a4, MASK0	// mask for byte 0
194	movi	a7, MASK4
195	j	.Lfirstword
196.Lnextword:
197	addi	a2, a2, 4	// advance s1 pointer
198	addi	a3, a3, 4	// advance s2 pointer
199.Lfirstword:
200	l32i	a8, a2, 0	// get word from s1
201	l32i	a9, a3, 0	// get word from s2
202	slli	a5, a8, 1
203	bne	a8, a9, .Lwne2
204	or	a9, a8, a5
205	ball	a9, a7, .Lnextword
206#endif /* !XCHAL_HAVE_LOOPS */
207
208	/* align (0 mod 4) */
209.Lprobeq:
210	/* Words are probably equal, but check for sure.
211	   If not, loop over the rest of string using normal algorithm.  */
212
213	bnone	a8, a4, .Leq	// if byte 0 is zero
214#if XCHAL_HAVE_L32R
215	l32r	a5, .Lmask1	// mask for byte 1
216	l32r	a6, .Lmask2	// mask for byte 2
217	bnone	a8, a5, .Leq	// if byte 1 is zero
218	l32r	a7, .Lmask3	// mask for byte 3
219	bnone	a8, a6, .Leq	// if byte 2 is zero
220	bnone	a8, a7, .Leq	// if byte 3 is zero
221	/* align (1 mod 4) */
222#else
223	const16	a5, MASK1@h	// mask for byte 1
224	const16	a5, MASK1@l
225	bnone	a8, a5, .Leq	// if byte 1 is zero
226	const16	a6, MASK2@h	// mask for byte 2
227	const16	a6, MASK2@l
228	bnone	a8, a6, .Leq	// if byte 2 is zero
229	const16	a7, MASK3@h	// mask for byte 3
230	const16	a7, MASK3@l
231	bnone	a8, a7, .Leq	// if byte 3 is zero
232	/* align (2 mod 4) */
233#endif /* XCHAL_HAVE_L32R */
234#if XCHAL_HAVE_DENSITY
235	addi.n	a2, a2, 4	// advance s1 pointer
236	addi.n	a3, a3, 4	// advance s2 pointer
237	/* align (1 mod 4) or (2 mod 4) */
238#else
239	addi	a2, a2, 4	// advance s1 pointer
240	addi	a3, a3, 4	// advance s2 pointer
241	or	a1, a1, a1	// nop
242#if !XCHAL_HAVE_L32R
243	or	a1, a1, a1	// nop
244#endif
245	/* align (2 mod 4) */
246#endif /* XCHAL_HAVE_DENSITY */
247#if XCHAL_HAVE_LOOPS
2481:
249	loop	a0, .Leq	// loop forever (a4 is bigger than max iters)
250	l32i	a8, a2, 0	// get word from s1
251	l32i	a9, a3, 0	// get word from s2
252	addi	a2, a2, 4	// advance s1 pointer
253	bne	a8, a9, .Lwne
254	bnone	a8, a4, .Leq	// if byte 0 is zero
255	bnone	a8, a5, .Leq	// if byte 1 is zero
256	bnone	a8, a6, .Leq	// if byte 2 is zero
257	bnone	a8, a7, .Leq	// if byte 3 is zero
258	addi	a3, a3, 4	// advance s2 pointer
259	j	1b
260#else /* !XCHAL_HAVE_LOOPS */
261
262	j	.Lfirstword2
263.Lnextword2:
264	addi	a3, a3, 4	// advance s2 pointer
265.Lfirstword2:
266	l32i	a8, a2, 0	// get word from s1
267	l32i	a9, a3, 0	// get word from s2
268	addi	a2, a2, 4	// advance s1 pointer
269	bne	a8, a9, .Lwne
270	bnone	a8, a4, .Leq	// if byte 0 is zero
271	bnone	a8, a5, .Leq	// if byte 1 is zero
272	bnone	a8, a6, .Leq	// if byte 2 is zero
273	bany	a8, a7, .Lnextword2	// if byte 3 is zero
274#endif /* !XCHAL_HAVE_LOOPS */
275
276	/* Words are equal; some byte is zero.  */
277.Leq:	movi	a2, 0		// return equal
278	leaf_return
279
280.Lwne2:	/* Words are not equal.  On big-endian processors, if none of the
281	   bytes are zero, the return value can be determined by a simple
282	   comparison.  */
283#ifdef __XTENSA_EB__
284	or	a10, a8, a5
285	bnall	a10, a7, .Lsomezero
286	bgeu	a8, a9, .Lposreturn
287	movi	a2, -1
288	leaf_return
289.Lposreturn:
290	movi	a2, 1
291	leaf_return
292.Lsomezero:	// There is probably some zero byte.
293#endif /* __XTENSA_EB__ */
294.Lwne:	/* Words are not equal.  */
295	xor	a2, a8, a9	// get word with nonzero in byte that differs
296	bany	a2, a4, .Ldiff0	// if byte 0 differs
297	movi	a5, MASK1	// mask for byte 1
298	bnone	a8, a4, .Leq	// if byte 0 is zero
299	bany	a2, a5, .Ldiff1	// if byte 1 differs
300	movi	a6, MASK2	// mask for byte 2
301	bnone	a8, a5, .Leq	// if byte 1 is zero
302	bany	a2, a6, .Ldiff2	// if byte 2 differs
303	bnone	a8, a6, .Leq	// if byte 2 is zero
304#ifdef __XTENSA_EB__
305.Ldiff3:
306.Ldiff2:
307.Ldiff1:
308	/* Byte 0 is equal (at least) and there is a difference before a zero
309	   byte.  Just subtract words to get the return value.
310	   The high order equal bytes cancel, leaving room for the sign.  */
311	sub	a2, a8, a9
312	leaf_return
313
314.Ldiff0:
315	/* Need to make room for the sign, so can't subtract whole words.  */
316	extui	a10, a8, 24, 8
317	extui	a11, a9, 24, 8
318	sub	a2, a10, a11
319	leaf_return
320
321#else /* !__XTENSA_EB__ */
322	/* Little-endian is a little more difficult because can't subtract
323	   whole words.  */
324.Ldiff3:
325	/* Bytes 0-2 are equal; byte 3 is different.
326	   For little-endian need to have a sign bit for the difference.  */
327	extui	a10, a8, 24, 8
328	extui	a11, a9, 24, 8
329	sub	a2, a10, a11
330	leaf_return
331
332.Ldiff0:
333	/* Byte 0 is different.  */
334	extui	a10, a8, 0, 8
335	extui	a11, a9, 0, 8
336	sub	a2, a10, a11
337	leaf_return
338
339.Ldiff1:
340	/* Byte 0 is equal; byte 1 is different.  */
341	extui	a10, a8, 8, 8
342	extui	a11, a9, 8, 8
343	sub	a2, a10, a11
344	leaf_return
345
346.Ldiff2:
347	/* Bytes 0-1 are equal; byte 2 is different.  */
348	extui	a10, a8, 16, 8
349	extui	a11, a9, 16, 8
350	sub	a2, a10, a11
351	leaf_return
352
353#endif /* !__XTENSA_EB */
354
355	.size	strcmp, . - strcmp
356