1/* ANSI C standard library function strcmp.
2
3   Copyright (c) 2001-2012 Tensilica Inc.
4
5   Permission is hereby granted, free of charge, to any person obtaining
6   a copy of this software and associated documentation files (the
7   "Software"), to deal in the Software without restriction, including
8   without limitation the rights to use, copy, modify, merge, publish,
9   distribute, sublicense, and/or sell copies of the Software, and to
10   permit persons to whom the Software is furnished to do so, subject to
11   the following conditions:
12
13   The above copyright notice and this permission notice shall be included
14   in all copies or substantial portions of the Software.
15
16   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19   IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20   CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22   SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
23
24#include "xtensa-asm.h"
25
26#define MASK4 0x40404040
27
28
29#if XCHAL_HAVE_L32R
	/* Literal pool for the byte masks used below.  MASK0..MASK3 are
	   not defined in this file (presumably per-endianness one-byte
	   selector masks from xtensa-asm.h — TODO confirm); MASK4 is
	   0x40404040 (bit 6 of every byte) and drives the ASCII
	   zero-detection trick described in the comments below.  */
30	.literal .Lmask0, MASK0
31	.literal .Lmask1, MASK1
32	.literal .Lmask2, MASK2
33	.literal .Lmask3, MASK3
34	.literal .Lmask4, MASK4
35#endif /* XCHAL_HAVE_L32R */
36
37	.text
38	.align	4
39	.literal_position
40	.global	strcmp
41	.type	strcmp, @function
/* int strcmp (const char *s1, const char *s2)

   Entry:  a2 = s1, a3 = s2 (leaf_entry per xtensa-asm.h).
   Return: a2 = 0 if the strings are equal, otherwise the difference
   of the first pair of bytes that differ (sign gives the ordering).
   Three configuration-specific variants follow: FLIX3 little-endian,
   PDX4 little-endian, and a generic version.  */
42strcmp:
43
44#if XCHAL_HAVE_L32R && XCHAL_HAVE_LOOPS && XCHAL_HAVE_DENSITY && __XTENSA_EL__ && XCHAL_HAVE_FLIX3
45/*  Fast version for FLIX3 Little Endian */
46
47
48	leaf_entry sp, 16
49	/* a2 = s1, a3 = s2 */
50
51	l8ui	a8, a2, 0	// byte 0 from s1
52	l8ui	a9, a3, 0	// byte 0 from s2
53	movi	a10, 3		// mask
54	movi	a5, 0xfffffffc	// word-alignment mask
55	or	a11, a2, a3
56	l32r	a4, .Lmask0	// mask for byte 0
57	l32r	a7, .Lmask4
	/* Bias both pointers by -8 so the loops below can address the
	   current word with offset 8 and the next word with offset 12.  */
58	addi    a3, a3, -8
59	addi    a2, a2, -8
60	and	a5, a5, a2
61	bne.w18	a8, a9, .Lretdiff	// first bytes already differ
62	l32i	a8, a5, 8	// get word from aligned variant of s1
63
64	bany.w18	a11, a10, .Lnot_aligned
65
66/* s1 is word-aligned; s2 is word-aligned.
67
68   If the zero-overhead loop option is available, use an (almost)
69   infinite zero-overhead loop with conditional exits so we only pay
70   for taken branches when exiting the loop.  */
71
72/* New algorithm, relying on the fact that all normal ASCII is between
73   32 and 127.
74
75   Rather than check all bytes for zero:
76   Take one word (4 bytes).  Call it w1.
77   Shift w1 left by one into w1'.
78   Or w1 and w1'.  For all normal ASCII bit 6 will be 1; for zero it won't.
79   Check that all 4 bit 6's (one for each byte) are one:
80   If they are, we are definitely not done.
81   If they are not, we are probably done, but need to check for zero.  */
82
83.Laligned:
84	/* Loop forever */
851:
86	loop	a0, .Laligned_done
87
88	/* First unrolled loop body.  */
89	l32i	a9, a3, 8	// get word from s2
90	addi	a3, a3, 8	// advance s2 pointer
91	slli	a5, a8, 1
92	or	a10, a8, a5
93	{l32i	a11, a2, 12	// get word from s1+4
94	bne.w18	a8, a9, .Lwne2}
95	l32i	a9, a3, 4	// get word from s2+4
96	bnall.w18	a10, a7, .Lprobeq
97
98	/* Second unrolled loop body.  */
99	slli	a5, a11, 1
100	or	a10, a11, a5
101	addi	a2, a2, 8	// advance s1 pointer
102        mov	a8, a11
103	bne.w18	a11, a9, .Lwne2
104	l32i	a8, a2, 8	// get word from s1
105	bnall.w18	a10, a7, .Lprobeq2
106
107.Laligned_done:
108	l32i	a8, a2, 8	// get word from s1
109	j     	1b
110
111.Lnot_aligned:
112	xor	a11, a2, a3	// compare low two bits of s1 and s2
113	bany	a11, a10, .Lunaligned	// if they have different alignment
114
115	/* s1/s2 are not word-aligned.  */
116	movi	a5, 0xfffffffc
117	addi	a2, a2, 1	// advance s1
118	beqz	a9, .Leq	// bytes equal, if zero, strings are equal
119	addi	a3, a3, 1	// advance s2
120	and     a6, a2, a5
121	l32i	a8, a6, 8	// get word from s1
122	bnone	a2, a10, .Laligned // if s1/s2 now aligned
123	l8ui	a8, a2, 8	// byte 1 from s1
124	l8ui	a9, a3, 8	// byte 1 from s2
125	addi	a2, a2, 1	// advance s1
126	bne	a8, a9, .Lretdiff // if different, return difference
127	beqz	a8, .Leq	// bytes equal, if zero, strings are equal
128	addi	a3, a3, 1	// advance s2
129	and     a6, a2, a5
130	l32i	a8, a6, 8	// get word from s1
131	bnone	a2, a10, .Laligned // if s1/s2 now aligned
132	l8ui	a8, a2, 8	// byte 2 from s1
133	l8ui	a9, a3, 8	// byte 2 from s2
134	addi	a2, a2, 1	// advance s1
135	bne	a8, a9, .Lretdiff // if different, return difference
136	beqz	a8, .Leq	// bytes equal, if zero, strings are equal
137	addi	a3, a3, 1	// advance s2
138	l32i	a8, a2, 8	// get word from s1
139	j	.Laligned
140
141/* s1 and s2 have different alignment.
142
143   If the zero-overhead loop option is available, use an (almost)
144   infinite zero-overhead loop with conditional exits so we only pay
145   for taken branches when exiting the loop.
146
147   Note: It is important for this unaligned case to come before the
148   code for aligned strings, because otherwise some of the branches
149   above cannot reach and have to be transformed to branches around
150   jumps.  The unaligned code is smaller and the branches can reach
151   over it.  */
152
153.Lunaligned:
154	movi.n	a8, 0		// set up for the maximum loop count
155	loop	a8, .Lretdiff	// loop forever (almost anyway)
156	l8ui	a8, a2, 8
157	l8ui	a9, a3, 8
158	addi	a2, a2, 1
159	bne	a8, a9, .Lretdiff
160	addi	a3, a3, 1
161	beqz	a8, .Lretdiff
162.Lretdiff:
163	sub	a2, a8, a9	// return difference of last bytes compared
164	leaf_return
165
166
167.Lprobeq2:
168	/* Adjust pointers to account for the loop unrolling.  */
169        mov	a8, a11		// a8 = word from s1+4 being probed
170	addi	a2, a2, -4
171	addi	a3, a3, 4
172
173	/* align (0 mod 4) */
174.Lprobeq:
175	/* Words are probably equal, but check for sure.
176	   If not, loop over the rest of string using normal algorithm.  */
177
178	bnone	a8, a4, .Leq	// if byte 0 is zero
179	l32r	a5, .Lmask1	// mask for byte 1
180	l32r	a6, .Lmask2	// mask for byte 2
181	bnone	a8, a5, .Leq	// if byte 1 is zero
182	l32r	a7, .Lmask3	// mask for byte 3
183	bnone	a8, a6, .Leq	// if byte 2 is zero
184	bnone	a8, a7, .Leq	// if byte 3 is zero
185	/* align (1 mod 4) */
186	addi.n	a2, a2, 12	// advance s1 pointer
187	addi.n	a3, a3, 4	// advance s2 pointer
188	/* align (1 mod 4) or (2 mod 4) */
1891:
190	loop	a0, .Lend	// loop forever (a4 is bigger than max iters)
191
192	l32i	a11, a2, 0	// get word from s1
193	l32i	a9, a3, 0	// get word from s2
194	addi	a2, a2, 4	// advance s1 pointer
195	bne	a11, a9, .Lwne
196	bnone	a11, a4, .Leq	// if byte 0 is zero
197	bnone	a11, a5, .Leq	// if byte 1 is zero
198	bnone	a11, a6, .Leq	// if byte 2 is zero
199	bnone	a11, a7, .Leq	// if byte 3 is zero
200	addi	a3, a3, 4	// advance s2 pointer
201.Lend:
202	j	1b
203
204	/* Words are equal; some byte is zero.  */
205.Leq:	movi	a2, 0		// return equal
206	leaf_return
207
208.Lwne2:	/* Words are not equal.  (This variant is little-endian only,
209	   so fall through and find the first differing byte, checking
210	   for an earlier zero byte as we go.)  */
211.Lwne:	/* Words are not equal.  */
212	xor	a2, a8, a9	// get word with nonzero in byte that differs
213	extui	a10, a8, 0, 8
214	extui	a11, a9, 0, 8
215	movi	a5, MASK1	// mask for byte 1
216	bany.w18	a2, a4, .Ldiff0	// if byte 0 differs
217
218	bnone.w18	a8, a4, .Leq	// if byte 0 is zero
219	movi	a6, MASK2	// mask for byte 2
220	bany.w18	a2, a5, .Ldiff1	// if byte 1 differs
221	extui	a10, a8, 24, 8
222	bnone.w18	a8, a5, .Leq	// if byte 1 is zero
223	extui	a11, a9, 24, 8
224	bany.w18	a2, a6, .Ldiff2	// if byte 2 differs
225	sub	a2, a10, a11
226	bnone.w18	a8, a6, .Leq	// if byte 2 is zero
227	/* Little-endian is a little more difficult because can't subtract
228	   whole words.  */
229.Ldiff3:
230	/* Bytes 0-2 are equal; byte 3 is different.
231	   For little-endian need to have a sign bit for the difference.  */
232	leaf_return
233.Ldiff0:
234	/* Byte 0 is different.  */
235	sub	a2, a10, a11
236	leaf_return
237
238.Ldiff1:
239	/* Byte 0 is equal; byte 1 is different.  */
240	extui	a10, a8, 8, 8
241	extui	a11, a9, 8, 8
242	sub	a2, a10, a11
243	leaf_return
244
245.Ldiff2:
246	/* Bytes 0-1 are equal; byte 2 is different.  */
247	extui	a10, a8, 16, 8
248	extui	a11, a9, 16, 8
249	sub	a2, a10, a11
250	leaf_return
251
252#else
253#if XCHAL_HAVE_L32R && XCHAL_HAVE_LOOPS && XCHAL_HAVE_DENSITY && __XTENSA_EL__ && XCHAL_HAVE_PDX4
254/*  Fast version for PDX4 Little Endian */
/*  (Same algorithm as the FLIX3 variant above; only the branch
    encodings (.w15 instead of .w18) and the explicit 4-slot
    instruction bundle differ.)  */
255
256
257	leaf_entry sp, 16
258	/* a2 = s1, a3 = s2 */
259
260	l8ui	a8, a2, 0	// byte 0 from s1
261	l8ui	a9, a3, 0	// byte 0 from s2
262	movi	a10, 3		// mask
263	movi	a5, 0xfffffffc	// word-alignment mask
264	or	a11, a2, a3
265	l32r	a4, .Lmask0	// mask for byte 0
266	l32r	a7, .Lmask4
	/* Bias both pointers by -8 so the loops below can address the
	   current word with offset 8 and the next word with offset 12.  */
267	addi    a3, a3, -8
268	addi    a2, a2, -8
269	and	a5, a5, a2
270	bne.w15	a8, a9, .Lretdiff	// first bytes already differ
271	l32i	a8, a5, 8	// get word from aligned variant of s1
272
273	bany.w15	a11, a10, .Lnot_aligned
274
275/* s1 is word-aligned; s2 is word-aligned.
276
277   If the zero-overhead loop option is available, use an (almost)
278   infinite zero-overhead loop with conditional exits so we only pay
279   for taken branches when exiting the loop.  */
280
281/* New algorithm, relying on the fact that all normal ASCII is between
282   32 and 127.
283
284   Rather than check all bytes for zero:
285   Take one word (4 bytes).  Call it w1.
286   Shift w1 left by one into w1'.
287   Or w1 and w1'.  For all normal ASCII bit 6 will be 1; for zero it won't.
288   Check that all 4 bit 6's (one for each byte) are one:
289   If they are, we are definitely not done.
290   If they are not, we are probably done, but need to check for zero.  */
291
292.Laligned:
293	/* Loop forever */
2941:
295	loop	a0, .Laligned_done
296
297	/* First unrolled loop body.  */
298	l32i	a9, a3, 8	// get word from s2
299	addi	a3, a3, 8	// advance s2 pointer
300	slli	a5, a8, 1
301	or	a10, a8, a5
302	{
303	bne.w15	a8, a9, .Lwne2
304	l32i	a11, a2, 12	// get word from s1+4
305	nop
306	nop
307	}
308	l32i	a9, a3, 4	// get word from s2+4
309	bnall.w15	a10, a7, .Lprobeq
310
311	/* Second unrolled loop body.  */
312	slli	a5, a11, 1
313	or	a10, a11, a5
314	addi	a2, a2, 8	// advance s1 pointer
315        mov	a8, a11
316	bne.w15	a11, a9, .Lwne2
317	l32i	a8, a2, 8	// get word from s1
318	bnall.w15	a10, a7, .Lprobeq2
319
320.Laligned_done:
321	l32i	a8, a2, 8	// get word from s1
322	j     	1b
323
324.Lnot_aligned:
325	xor	a11, a2, a3	// compare low two bits of s1 and s2
326	bany	a11, a10, .Lunaligned	// if they have different alignment
327
328	/* s1/s2 are not word-aligned.  */
329	movi	a5, 0xfffffffc
330	addi	a2, a2, 1	// advance s1
331	beqz	a9, .Leq	// bytes equal, if zero, strings are equal
332	addi	a3, a3, 1	// advance s2
333	and     a6, a2, a5
334	l32i	a8, a6, 8	// get word from s1
335	bnone	a2, a10, .Laligned // if s1/s2 now aligned
336	l8ui	a8, a2, 8	// byte 1 from s1
337	l8ui	a9, a3, 8	// byte 1 from s2
338	addi	a2, a2, 1	// advance s1
339	bne	a8, a9, .Lretdiff // if different, return difference
340	beqz	a8, .Leq	// bytes equal, if zero, strings are equal
341	addi	a3, a3, 1	// advance s2
342	and     a6, a2, a5
343	l32i	a8, a6, 8	// get word from s1
344	bnone	a2, a10, .Laligned // if s1/s2 now aligned
345	l8ui	a8, a2, 8	// byte 2 from s1
346	l8ui	a9, a3, 8	// byte 2 from s2
347	addi	a2, a2, 1	// advance s1
348	bne	a8, a9, .Lretdiff // if different, return difference
349	beqz	a8, .Leq	// bytes equal, if zero, strings are equal
350	addi	a3, a3, 1	// advance s2
351	l32i	a8, a2, 8	// get word from s1
352	j	.Laligned
353
354/* s1 and s2 have different alignment.
355
356   If the zero-overhead loop option is available, use an (almost)
357   infinite zero-overhead loop with conditional exits so we only pay
358   for taken branches when exiting the loop.
359
360   Note: It is important for this unaligned case to come before the
361   code for aligned strings, because otherwise some of the branches
362   above cannot reach and have to be transformed to branches around
363   jumps.  The unaligned code is smaller and the branches can reach
364   over it.  */
365
366.Lunaligned:
367	movi.n	a8, 0		// set up for the maximum loop count
368	loop	a8, .Lretdiff	// loop forever (almost anyway)
369	l8ui	a8, a2, 8
370	l8ui	a9, a3, 8
371	addi	a2, a2, 1
372	bne	a8, a9, .Lretdiff
373	addi	a3, a3, 1
374	beqz	a8, .Lretdiff
375.Lretdiff:
376	sub	a2, a8, a9	// return difference of last bytes compared
377	leaf_return
378
379
380.Lprobeq2:
381	/* Adjust pointers to account for the loop unrolling.  */
382        mov	a8, a11		// a8 = word from s1+4 being probed
383	addi	a2, a2, -4
384	addi	a3, a3, 4
385
386	/* align (0 mod 4) */
387.Lprobeq:
388	/* Words are probably equal, but check for sure.
389	   If not, loop over the rest of string using normal algorithm.  */
390
391	bnone	a8, a4, .Leq	// if byte 0 is zero
392	l32r	a5, .Lmask1	// mask for byte 1
393	l32r	a6, .Lmask2	// mask for byte 2
394	bnone	a8, a5, .Leq	// if byte 1 is zero
395	l32r	a7, .Lmask3	// mask for byte 3
396	bnone	a8, a6, .Leq	// if byte 2 is zero
397	bnone	a8, a7, .Leq	// if byte 3 is zero
398	/* align (1 mod 4) */
399	addi.n	a2, a2, 12	// advance s1 pointer
400	addi.n	a3, a3, 4	// advance s2 pointer
401	/* align (1 mod 4) or (2 mod 4) */
4021:
403	loop	a0, .Lend	// loop forever (a4 is bigger than max iters)
404
405	l32i	a11, a2, 0	// get word from s1
406	l32i	a9, a3, 0	// get word from s2
407	addi	a2, a2, 4	// advance s1 pointer
408	bne	a11, a9, .Lwne
409	bnone	a11, a4, .Leq	// if byte 0 is zero
410	bnone	a11, a5, .Leq	// if byte 1 is zero
411	bnone	a11, a6, .Leq	// if byte 2 is zero
412	bnone	a11, a7, .Leq	// if byte 3 is zero
413	addi	a3, a3, 4	// advance s2 pointer
414.Lend:
415	j	1b
416
417	/* Words are equal; some byte is zero.  */
418.Leq:	movi	a2, 0		// return equal
419	leaf_return
420
421.Lwne2:	/* Words are not equal.  (This variant is little-endian only,
422	   so fall through and find the first differing byte, checking
423	   for an earlier zero byte as we go.)  */
424.Lwne:	/* Words are not equal.  */
425	xor	a2, a8, a9	// get word with nonzero in byte that differs
426	extui	a10, a8, 0, 8
427	extui	a11, a9, 0, 8
428	movi	a5, MASK1	// mask for byte 1
429	bany.w15	a2, a4, .Ldiff0	// if byte 0 differs
430
431	bnone.w15	a8, a4, .Leq	// if byte 0 is zero
432	movi	a6, MASK2	// mask for byte 2
433	bany.w15	a2, a5, .Ldiff1	// if byte 1 differs
434	extui	a10, a8, 24, 8
435	bnone.w15	a8, a5, .Leq	// if byte 1 is zero
436	extui	a11, a9, 24, 8
437	bany.w15	a2, a6, .Ldiff2	// if byte 2 differs
438	sub	a2, a10, a11
439	bnone.w15	a8, a6, .Leq	// if byte 2 is zero
440	/* Little-endian is a little more difficult because can't subtract
441	   whole words.  */
442.Ldiff3:
443	/* Bytes 0-2 are equal; byte 3 is different.
444	   For little-endian need to have a sign bit for the difference.  */
445	leaf_return
446.Ldiff0:
447	/* Byte 0 is different.  */
448	sub	a2, a10, a11
449	leaf_return
450
451.Ldiff1:
452	/* Byte 0 is equal; byte 1 is different.  */
453	extui	a10, a8, 8, 8
454	extui	a11, a9, 8, 8
455	sub	a2, a10, a11
456	leaf_return
457
458.Ldiff2:
459	/* Bytes 0-1 are equal; byte 2 is different.  */
460	extui	a10, a8, 16, 8
461	extui	a11, a9, 16, 8
462	sub	a2, a10, a11
463	leaf_return
464
465
466#else /* Not FLIX3 */
/* Generic version: handles either endianness and degrades gracefully
   when zero-overhead loops, density instructions, or L32R are not
   configured.  Same return contract as above.  */
467	leaf_entry sp, 16
468	/* a2 = s1, a3 = s2 */
469
470	l8ui	a8, a2, 0	// byte 0 from s1
471	l8ui	a9, a3, 0	// byte 0 from s2
472	movi	a10, 3		// mask
473	bne	a8, a9, .Lretdiff
474
475	or	a11, a2, a3
476	bnone	a11, a10, .Laligned
477
478	xor	a11, a2, a3	// compare low two bits of s1 and s2
479	bany	a11, a10, .Lunaligned	// if they have different alignment
480
481	/* s1/s2 are not word-aligned.  */
482	addi	a2, a2, 1	// advance s1
483	beqz	a8, .Leq	// bytes equal, if zero, strings are equal
484	addi	a3, a3, 1	// advance s2
485	bnone	a2, a10, .Laligned // if s1/s2 now aligned
486	l8ui	a8, a2, 0	// byte 1 from s1
487	l8ui	a9, a3, 0	// byte 1 from s2
488	addi	a2, a2, 1	// advance s1
489	bne	a8, a9, .Lretdiff // if different, return difference
490	beqz	a8, .Leq	// bytes equal, if zero, strings are equal
491	addi	a3, a3, 1	// advance s2
492	bnone	a2, a10, .Laligned // if s1/s2 now aligned
493	l8ui	a8, a2, 0	// byte 2 from s1
494	l8ui	a9, a3, 0	// byte 2 from s2
495	addi	a2, a2, 1	// advance s1
496	bne	a8, a9, .Lretdiff // if different, return difference
497	beqz	a8, .Leq	// bytes equal, if zero, strings are equal
498	addi	a3, a3, 1	// advance s2
499	j	.Laligned
500
501/* s1 and s2 have different alignment.
502
503   If the zero-overhead loop option is available, use an (almost)
504   infinite zero-overhead loop with conditional exits so we only pay
505   for taken branches when exiting the loop.
506
507   Note: It is important for this unaligned case to come before the
508   code for aligned strings, because otherwise some of the branches
509   above cannot reach and have to be transformed to branches around
510   jumps.  The unaligned code is smaller and the branches can reach
511   over it.  */
512
513	.align	4
514#if XCHAL_HAVE_LOOPS
515#if XCHAL_HAVE_DENSITY
516	/* (2 mod 4) alignment for loop instruction */
517#else
518	/* (1 mod 4) alignment for loop instruction */
519	.byte	0
520	.byte	0
521#endif
522#endif
523.Lunaligned:
524#if XCHAL_HAVE_LOOPS
525#if XCHAL_HAVE_DENSITY
526	_movi.n	a8, 0		// set up for the maximum loop count
527#else
528	_movi	a8, 0		// set up for the maximum loop count
529#endif
530	loop	a8, .Lretdiff	// loop forever (almost anyway)
531#endif
532.Lnextbyte:
533	l8ui	a8, a2, 0
534	l8ui	a9, a3, 0
535	addi	a2, a2, 1
536	bne	a8, a9, .Lretdiff
537	addi	a3, a3, 1
538#if XCHAL_HAVE_LOOPS
539	beqz	a8, .Lretdiff
540#else
541	bnez	a8, .Lnextbyte
542#endif
543.Lretdiff:
544	sub	a2, a8, a9	// return difference of last bytes compared
545	leaf_return
546
547/* s1 is word-aligned; s2 is word-aligned.
548
549   If the zero-overhead loop option is available, use an (almost)
550   infinite zero-overhead loop with conditional exits so we only pay
551   for taken branches when exiting the loop.  */
552
553/* New algorithm, relying on the fact that all normal ASCII is between
554   32 and 127.
555
556   Rather than check all bytes for zero:
557   Take one word (4 bytes).  Call it w1.
558   Shift w1 left by one into w1'.
559   Or w1 and w1'.  For all normal ASCII bit 6 will be 1; for zero it won't.
560   Check that all 4 bit 6's (one for each byte) are one:
561   If they are, we are definitely not done.
562   If they are not, we are probably done, but need to check for zero.  */
563
564	.align	4
565#if XCHAL_HAVE_LOOPS
566#if !XCHAL_HAVE_L32R
567	/* (2 mod 4) alignment for loop instruction */
568	.byte	0
569	.byte	0
570#endif
571.Laligned:
572#if XCHAL_HAVE_L32R
573	l32r	a4, .Lmask0	// mask for byte 0
574	l32r	a7, .Lmask4
575#else
576	const16	a4, MASK0@h
577	const16	a4, MASK0@l
578	const16	a7, MASK4@h
579	const16	a7, MASK4@l
580#endif
581	/* Loop forever */
5821:
583	loop	a0, .Laligned_done
584
585	/* First unrolled loop body.  */
586	l32i	a8, a2, 0	// get word from s1
587	l32i	a9, a3, 0	// get word from s2
588	slli	a5, a8, 1
589	bne	a8, a9, .Lwne2
590	or	a9, a8, a5
591	bnall	a9, a7, .Lprobeq
592
593	/* Second unrolled loop body.  */
594	l32i	a8, a2, 4	// get word from s1+4
595	l32i	a9, a3, 4	// get word from s2+4
596	slli	a5, a8, 1
597	bne	a8, a9, .Lwne2
598	or	a9, a8, a5
599	bnall	a9, a7, .Lprobeq2
600
601	addi	a2, a2, 8	// advance s1 pointer
602	addi	a3, a3, 8	// advance s2 pointer
603.Laligned_done:
604	j     	1b
605
606.Lprobeq2:
607	/* Adjust pointers to account for the loop unrolling.  */
608	addi	a2, a2, 4
609	addi	a3, a3, 4
610
611#else /* !XCHAL_HAVE_LOOPS */
612
613.Laligned:
614	movi	a4, MASK0	// mask for byte 0
615	movi	a7, MASK4
616	j	.Lfirstword
617.Lnextword:
618	addi	a2, a2, 4	// advance s1 pointer
619	addi	a3, a3, 4	// advance s2 pointer
620.Lfirstword:
621	l32i	a8, a2, 0	// get word from s1
622	l32i	a9, a3, 0	// get word from s2
623	slli	a5, a8, 1
624	bne	a8, a9, .Lwne2
625	or	a9, a8, a5
626	ball	a9, a7, .Lnextword
627#endif /* !XCHAL_HAVE_LOOPS */
628
629	/* align (0 mod 4) */
630.Lprobeq:
631	/* Words are probably equal, but check for sure.
632	   If not, loop over the rest of string using normal algorithm.  */
633
634	bnone	a8, a4, .Leq	// if byte 0 is zero
635#if XCHAL_HAVE_L32R
636	l32r	a5, .Lmask1	// mask for byte 1
637	l32r	a6, .Lmask2	// mask for byte 2
638	bnone	a8, a5, .Leq	// if byte 1 is zero
639	l32r	a7, .Lmask3	// mask for byte 3
640	bnone	a8, a6, .Leq	// if byte 2 is zero
641	bnone	a8, a7, .Leq	// if byte 3 is zero
642	/* align (1 mod 4) */
643#else
644	const16	a5, MASK1@h	// mask for byte 1
645	const16	a5, MASK1@l
646	bnone	a8, a5, .Leq	// if byte 1 is zero
647	const16	a6, MASK2@h	// mask for byte 2
648	const16	a6, MASK2@l
649	bnone	a8, a6, .Leq	// if byte 2 is zero
650	const16	a7, MASK3@h	// mask for byte 3
651	const16	a7, MASK3@l
652	bnone	a8, a7, .Leq	// if byte 3 is zero
653	/* align (2 mod 4) */
654#endif /* XCHAL_HAVE_L32R */
655#if XCHAL_HAVE_DENSITY
656	addi.n	a2, a2, 4	// advance s1 pointer
657	addi.n	a3, a3, 4	// advance s2 pointer
658	/* align (1 mod 4) or (2 mod 4) */
659#else
660	addi	a2, a2, 4	// advance s1 pointer
661	addi	a3, a3, 4	// advance s2 pointer
662	or	a1, a1, a1	// nop
663#if !XCHAL_HAVE_L32R
664	or	a1, a1, a1	// nop
665#endif
666	/* align (2 mod 4) */
667#endif /* XCHAL_HAVE_DENSITY */
668#if XCHAL_HAVE_LOOPS
6691:
670	loop	a0, .Leq	// loop forever (a4 is bigger than max iters)
671	l32i	a8, a2, 0	// get word from s1
672	l32i	a9, a3, 0	// get word from s2
673	addi	a2, a2, 4	// advance s1 pointer
674	bne	a8, a9, .Lwne
675	bnone	a8, a4, .Leq	// if byte 0 is zero
676	bnone	a8, a5, .Leq	// if byte 1 is zero
677	bnone	a8, a6, .Leq	// if byte 2 is zero
678	bnone	a8, a7, .Leq	// if byte 3 is zero
679	addi	a3, a3, 4	// advance s2 pointer
680	j	1b
681#else /* !XCHAL_HAVE_LOOPS */
682
683	j	.Lfirstword2
684.Lnextword2:
685	addi	a3, a3, 4	// advance s2 pointer
686.Lfirstword2:
687	l32i	a8, a2, 0	// get word from s1
688	l32i	a9, a3, 0	// get word from s2
689	addi	a2, a2, 4	// advance s1 pointer
690	bne	a8, a9, .Lwne
691	bnone	a8, a4, .Leq	// if byte 0 is zero
692	bnone	a8, a5, .Leq	// if byte 1 is zero
693	bnone	a8, a6, .Leq	// if byte 2 is zero
694	bany	a8, a7, .Lnextword2	// loop if byte 3 is nonzero
695#endif /* !XCHAL_HAVE_LOOPS */
696
697	/* Words are equal; some byte is zero.  */
698.Leq:	movi	a2, 0		// return equal
699	leaf_return
700
701.Lwne2:	/* Words are not equal.  On big-endian processors, if none of the
702	   bytes are zero, the return value can be determined by a simple
703	   comparison.  */
704#ifdef __XTENSA_EB__
705	or	a10, a8, a5
706	bnall	a10, a7, .Lsomezero
707	bgeu	a8, a9, .Lposreturn
708	movi	a2, -1
709	leaf_return
710.Lposreturn:
711	movi	a2, 1
712	leaf_return
713.Lsomezero:	// There is probably some zero byte.
714#endif /* __XTENSA_EB__ */
715.Lwne:	/* Words are not equal.  */
716	xor	a2, a8, a9	// get word with nonzero in byte that differs
717	bany	a2, a4, .Ldiff0	// if byte 0 differs
718	movi	a5, MASK1	// mask for byte 1
719	bnone	a8, a4, .Leq	// if byte 0 is zero
720	bany	a2, a5, .Ldiff1	// if byte 1 differs
721	movi	a6, MASK2	// mask for byte 2
722	bnone	a8, a5, .Leq	// if byte 1 is zero
723	bany	a2, a6, .Ldiff2	// if byte 2 differs
724	bnone	a8, a6, .Leq	// if byte 2 is zero
725#ifdef __XTENSA_EB__
726.Ldiff3:
727.Ldiff2:
728.Ldiff1:
729	/* Byte 0 is equal (at least) and there is a difference before a zero
730	   byte.  Just subtract words to get the return value.
731	   The high order equal bytes cancel, leaving room for the sign.  */
732	sub	a2, a8, a9
733	leaf_return
734
735.Ldiff0:
736	/* Need to make room for the sign, so can't subtract whole words.  */
737	extui	a10, a8, 24, 8
738	extui	a11, a9, 24, 8
739	sub	a2, a10, a11
740	leaf_return
741
742#else /* !__XTENSA_EB__ */
743	/* Little-endian is a little more difficult because can't subtract
744	   whole words.  */
745.Ldiff3:
746	/* Bytes 0-2 are equal; byte 3 is different.
747	   For little-endian need to have a sign bit for the difference.  */
748	extui	a10, a8, 24, 8
749	extui	a11, a9, 24, 8
750	sub	a2, a10, a11
751	leaf_return
752
753.Ldiff0:
754	/* Byte 0 is different.  */
755	extui	a10, a8, 0, 8
756	extui	a11, a9, 0, 8
757	sub	a2, a10, a11
758	leaf_return
759
760.Ldiff1:
761	/* Byte 0 is equal; byte 1 is different.  */
762	extui	a10, a8, 8, 8
763	extui	a11, a9, 8, 8
764	sub	a2, a10, a11
765	leaf_return
766
767.Ldiff2:
768	/* Bytes 0-1 are equal; byte 2 is different.  */
769	extui	a10, a8, 16, 8
770	extui	a11, a9, 16, 8
771	sub	a2, a10, a11
772	leaf_return
773
774#endif /* !__XTENSA_EB */
775#endif /* FLIX3*/
776#endif /* FLIX3*/
777
778	.size	strcmp, . - strcmp
779