/*
 * Copyright (c) 2012-2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

	/* Implementation of strcmp for ARMv6.  Use ldrd to support wider
	   loads, provided the data is sufficiently aligned.  Use
	   saturating arithmetic to optimize the compares.  */
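
	/* The core trick (sketched below in illustrative C-like pseudocode;
	   it is not part of the build): for each 32-bit word, UADD8 adds
	   0xff to every byte of data1 and records the per-byte carries in
	   the GE flags, so GE[i] is clear exactly when byte i of data1 is
	   zero.  SEL then builds a "syndrome" that is 0xff in any byte
	   where data1 held a NUL and data1 ^ data2 elsewhere:

	       ge[i]       = (data1.byte[i] != 0);
	       syndrome[i] = ge[i] ? (data1.byte[i] ^ data2.byte[i]) : 0xff;

	   A zero syndrome means "all bytes equal, no NUL seen", so the
	   loops below only leave the fast path when a syndrome is
	   non-zero.  */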

	/* Build Options:
	   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
	   byte in the string.  If comparing completely random strings
	   the pre-check will save time, since there is a very high
	   probability of a mismatch in the first character: we save
	   significant overhead if this is the common case.  However,
	   if strings are likely to be identical (eg because we're
	   verifying a hit in a hash table), then this check is largely
	   redundant.  */
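
	/* Note that this file is fed through the C preprocessor, so the
	   option above would typically be enabled by defining the macro
	   when assembling (for example -DSTRCMP_NO_PRECHECK); the exact
	   mechanism depends on the build system.  */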

#include <picolibc.h>

	.arm

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2
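
/* Register usage note: under the AAPCS r0-r3 and r12 may be clobbered
   freely, but r4-r7 are callee-saved, which is why the prologue below
   spills them before they are used as scratch registers.  */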


	/* Macro to compute and return the result value for word-aligned
	   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
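	/* Worked example (illustrative): comparing "abc" with "abcd" as
	   big-endian words gives data1 = 0x61626300, data2 = 0x61626364;
	   the NUL makes syndrome = 0x000000ff, so clz = 24, and shifting
	   both data words left by 24 before taking the top byte yields
	   0x00 - 0x64 < 0, the expected ordering.  */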
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1, lsr #24
	bx	lr
#else
	/* To use the big-endian trick we'd have to reverse all three words;
	   that's slower than this approach.  */
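	/* Instead, byte-reverse only the syndrome: after the REV, CLZ
	   counts bits in string order, and rounding the count down to a
	   byte boundary (BIC #7) gives the right shift that brings the
	   first differing or NUL byte of each unreversed data word down
	   into bits 0-7.  */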
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1

	bx	lr
#endif
	.endm

	.text
	.p2align	5
.Lstrcmp_start_addr:
#ifndef STRCMP_NO_PRECHECK
.Lfastpath_exit:
	sub	r0, r2, r3
	bx	lr
#endif
def_fn	strcmp
#ifndef STRCMP_NO_PRECHECK
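	/* Quick pre-check of the first byte: "cmp r2, #1" sets C only when
	   the byte from src1 is non-zero, so the conditional compare runs
	   only in that case, and the branch below is taken when the first
	   bytes differ or src1 starts with a NUL.  */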
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	cmpcs	r2, r3
	bne	.Lfastpath_exit
#endif
	.cfi_sections .debug_frame
	.cfi_startproc
	strd	r4, r5, [sp, #-16]!
	.cfi_def_cfa_offset 16
	.cfi_offset 4, -16
	.cfi_offset 5, -12
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_offset 6, -8
	.cfi_offset 7, -4
	mvn	const_m1, #0
	tst	tmp1, #7
	beq	.Lloop_aligned8

.Lnot_aligned:
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	.Lmisaligned8

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
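	/* For example, if both pointers are 3 bytes past an 8-byte
	   boundary, the ldrd below also fetches 3 bytes that precede the
	   strings; those bytes are forced to 0xff in both data words so
	   they compare equal and cannot look like a terminating NUL.  */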
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In ARM code we can't use ORN, but we do have MVN with a
	   register shift.  */
	mvn	tmp1, const_m1, S2HI tmp2
	orr	data1a, data1a, tmp1
	orr	data2a, data2a, tmp1
	beq	.Lstart_realigned8
	orr	data1b, data1b, tmp1
	mov	data1a, const_m1
	orr	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	.Lstart_realigned8

	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
.Lloop_aligned8:
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
.Lstart_realigned8:
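	/* Each word yields one syndrome: 0xff marks a NUL byte in data1,
	   any other non-zero byte marks a difference; ORRing the two
	   syndromes lets a single branch test both words.  */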
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
	bne	.Ldiff_found

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
	beq	.Lloop_aligned8

.Ldiff_found:
	cmp	syndrome_a, #0
	bne	.Ldiff_in_a

.Ldiff_in_b:
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	.cfi_restore_state
.Lmisaligned8:
	tst	tmp1, #3
	bne	.Lmisaligned4
	ands	tmp1, src1, #3
	bne	.Lmutual_align4

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
.Lloop_aligned4:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned4:
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	bne	.Laligned4_done

	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	.Lloop_aligned4

.Laligned4_done:
	strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
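	/* Same masking idea as the 64-bit aligned case above, but applied
	   to a single word: bytes loaded from before the start of the
	   strings are forced to 0xff in both data words.  */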
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In ARM code we can't use ORN, but we do have MVN with a
	   register shift.  */
	mvn	tmp1, const_m1, S2HI tmp1
	orr	data1, data1, tmp1
	orr	data2, data2, tmp1
	b	.Lstart_realigned4

.Lmisaligned4:
	ands	tmp1, src1, #3
	beq	.Lsrc1_aligned
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
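	/* The shift leaves Z set when bit 0 of the offset (src1 & 3) was
	   clear and C equal to bit 1, so an offset of 2 takes the beq, an
	   offset of 3 takes the bcs, and an offset of 1 falls through;
	   each path byte-checks only what remains before src1's next word
	   boundary.  */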
	ldr	data1, [src1], #4
	beq	.Laligned_m2
	bcs	.Laligned_m1

#ifdef STRCMP_NO_PRECHECK
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	cmp	tmp1, #1
	cmpcs	tmp1, data2
	bne	.Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	cmp	tmp1, #1
	cmpcs	tmp1, data2
	bne	.Lmisaligned_exit

.Laligned_m1:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	cmp	tmp1, #1
	cmpcs	tmp1, data2
	beq	.Lsrc1_aligned

#else  /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	cmp	tmp1, #1
	cmpcs	tmp1, data2
	bne	.Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	cmp	tmp1, #1
	cmpcs	tmp1, data2
	beq	.Laligned_m1
#endif

.Lmisaligned_exit:
	.cfi_remember_state
	sub	result, tmp1, data2
	ldr	r4, [sp], #16
	.cfi_restore 4
	bx	lr

#ifndef STRCMP_NO_PRECHECK
.Laligned_m1:
	add	src2, src2, #4
#endif
.Lsrc1_aligned:
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
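	/* The low two bits of src2 select one of the three "overlap" loops
	   below; each compares a whole word of src1 against bytes taken
	   from two consecutive words of src2, shifted into place.  */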
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cmp	syndrome, #0
	ldreq	data2, [src2], #4
	bne	5f

	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap3
4:
	S2LO	data2, data2, #8
	b	.Lstrcmp_tail

5:
	bics	syndrome, syndrome, #MSB
	bne	.Lstrcmp_done_equal

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 Not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	neg	result, result
	bx	lr

6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cmp	syndrome, #0
	ldreq	data2, [src2], #4
	bne	5f
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap2
4:
	S2LO	data2, data2, #16
	b	.Lstrcmp_tail
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	.Lstrcmp_done_equal

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	.Lstrcmp_tail

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cmp	syndrome, #0
	ldreq	data2, [src2], #4
	bne	5f
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap1
4:
	S2LO	data2, data2, #24
	b	.Lstrcmp_tail
5:
	tst	syndrome, #LSB
	bne	.Lstrcmp_done_equal
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	.Lstrcmp_tail

.Lstrcmp_done_equal:
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	bx	lr

.Lstrcmp_tail:
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	sub	result, result, data2, lsr #24
	bx	lr
	.cfi_endproc
	.size strcmp, . - .Lstrcmp_start_addr