/*
 * Copyright (c) 2012-2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

	/* Implementation of strcmp for ARMv7 when DSP instructions are
	   available.  Use ldrd to support wider loads, provided the data
	   is sufficiently aligned.  Use saturating arithmetic to optimize
	   the compares.  */
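
	/* The aligned loops below are, roughly, the following C sketch
	   (illustrative only; load32/has_nul are not names used in this
	   file):

	       uint32_t w1 = load32(src1), w2 = load32(src2);
	       // Non-zero iff some byte of w1 is zero.
	       uint32_t has_nul = (w1 - 0x01010101u) & ~w1 & 0x80808080u;
	       if ((w1 ^ w2) | has_nul)
	           resolve the result byte by byte;
	       else
	           advance both pointers and repeat;

	   Instead of the bit-twiddling above, the code uses UADD8/SEL:
	   UADD8 of a word with 0xffffffff sets a per-byte GE flag for
	   each non-zero byte, and SEL merges the EOR difference with 0xff
	   markers for NUL bytes into a single "syndrome" word.  */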

	/* Build Options:
	   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
	   byte in the string.  If comparing completely random strings
	   the pre-check will save time, since there is a very high
	   probability of a mismatch in the first character: we save
	   significant overhead if this is the common case.  However,
	   if strings are likely to be identical (e.g. because we're
	   verifying a hit in a hash table), then this check is largely
	   redundant.  */
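
	/* The option is selected at build time, typically by passing
	   -DSTRCMP_NO_PRECHECK on the command line when this file is
	   preprocessed and assembled.  */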

	/* This version uses Thumb-2 code.  */
#include <picolibc.h>

	.thumb
	.syntax unified

#include "arm_asm.h"

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2


	/* Macro to compute and return the result value for word-aligned
	   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF
#else
	/* To use the big-endian trick we'd have to reverse all three words.
	   That's slower than this approach.  */
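	/* REV byte-reverses the syndrome so the first string byte becomes
	   the most significant, letting CLZ find the first differing or
	   NUL byte; BIC #7 rounds the bit index down to a byte boundary
	   so the LSRs below drop that byte into bits 7..0 of each data
	   word.  */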
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1

	epilogue push_ip=HAVE_PAC_LEAF
#endif
	.endm

	.text
	.p2align	5
def_fn	strcmp
	.fnstart
	.cfi_sections .debug_frame
	.cfi_startproc
	prologue push_ip=HAVE_PAC_LEAF
#ifndef STRCMP_NO_PRECHECK
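	/* Pre-check the first byte pair: branch straight to the fast-path
	   return when the first byte of src1 is NUL or the first bytes
	   differ; in either case r2 - r3 is the correct result.  */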
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	.Lfastpath_exit
#endif
	strd	r4, r5, [sp, #-16]!
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_rel_offset 6, 8
	.cfi_rel_offset 7, 12
	mvn	const_m1, #0
	lsl	r2, tmp1, #29
	cbz	r2, .Lloop_aligned8

.Lnot_aligned:
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	.Lmisaligned8

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
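	/* The S2HI/ORN pair below forces the bytes loaded from before the
	   true start of the strings to 0xff, so they compare equal and
	   cannot be mistaken for a NUL terminator.  When the shared offset
	   is 4 or more (tst #4), the whole first word of each pair is
	   replaced with 0xff and the mask is applied to the second word
	   instead.  */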
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	.Lstart_realigned8
	orn	data1b, data1b, tmp1
	mov	data1a, const_m1
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	.Lstart_realigned8

	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
.Lloop_aligned8:
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
.Lstart_realigned8:
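	/* UADD8 adds 0xff to every byte of data1a and sets the per-byte
	   GE flag for each byte that is non-zero (the add carries out);
	   the value written to syndrome_b is discarded.  SEL then keeps
	   the EOR difference for non-zero bytes and substitutes 0xff
	   wherever data1a has a NUL byte, so a non-zero syndrome means a
	   mismatch or the end of the string.  */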
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, .Ldiff_in_a
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, .Ldiff_in_b

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for a backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
	beq	.Lloop_aligned8

.Ldiff_found:
	cbnz	syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	.cfi_restore_state
.Lmisaligned8:
	tst	tmp1, #3
	bne	.Lmisaligned4
	ands	tmp1, src1, #3
	bne	.Lmutual_align4

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
.Lloop_aligned4:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned4:
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, .Laligned4_done
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	.Lloop_aligned4

.Laligned4_done:
	strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
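	/* Same masking trick as the 8-byte case above, applied to a
	   single word of each string.  */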
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	.Lstart_realigned4

.Lmisaligned4:
	ands	tmp1, src1, #3
	beq	.Lsrc1_aligned
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	.Laligned_m2
	bcs	.Laligned_m1

#ifdef STRCMP_NO_PRECHECK
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m1:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	add	src2, src2, #4
	cbnz	data2, .Lsrc1_aligned
#else  /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbnz	data2, .Laligned_m1
#endif

.Lmisaligned_exit:
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	.cfi_restore 4
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF

#ifndef STRCMP_NO_PRECHECK
.Lfastpath_exit:
	.cfi_restore_state
	.cfi_remember_state
	sub	r0, r2, r3
	epilogue push_ip=HAVE_PAC_LEAF

.Laligned_m1:
	.cfi_restore_state
	.cfi_remember_state
	add	src2, src2, #4
#endif
.Lsrc1_aligned:
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
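	/* src2 is rounded down to a word boundary; the .LoverlapN loops
	   below stitch together two consecutive aligned src2 words with
	   S2LO/S2HI shifts, so that N bytes of one word and 4 - N bytes
	   of the next line up against each aligned src1 word.  */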
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap3
4:
	S2LO	data2, data2, #8
	b	.Lstrcmp_tail

5:
	bics	syndrome, syndrome, #MSB
	bne	.Lstrcmp_done_equal

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
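	/* Only the #MSB byte of the syndrome is set: the last byte of
	   this data1 word is the NUL terminator and the three bytes
	   before it matched src2, so the result is 0 minus the src2 byte
	   that pairs with the NUL, which is the byte src2 now points to.  */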
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 Not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	neg	result, result
	epilogue push_ip=HAVE_PAC_LEAF

6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap2
4:
	S2LO	data2, data2, #16
	b	.Lstrcmp_tail
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	.Lstrcmp_done_equal

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	.Lstrcmp_tail

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap1
4:
	S2LO	data2, data2, #24
	b	.Lstrcmp_tail
5:
	tst	syndrome, #LSB
	bne	.Lstrcmp_done_equal
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	.Lstrcmp_tail

.Lstrcmp_done_equal:
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF

.Lstrcmp_tail:
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
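	/* The syndrome built below marks the first differing or NUL byte
	   in string order; shifting both words left by CLZ of the
	   syndrome brings the first mismatching bit (or the 0xff NUL
	   marker) to bit 31, and the difference of the top eight bits of
	   each shifted word is the return value.  */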
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	sub	result, result, data2, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF
	.cfi_endproc
	.cantunwind
	.fnend
	.size strcmp, . - strcmp
