/*
 * Copyright (c) 2012-2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

	/* Implementation of strcmp for ARMv7 when DSP instructions are
	   available.  Use ldrd to support wider loads, provided the data
	   is sufficiently aligned.  Use saturating arithmetic to optimize
	   the compares.  */
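	/* The per-word test used throughout is built from UADD8 (parallel
	   byte add, whose per-byte carry-outs land in the GE flags) and SEL.
	   A rough, illustrative C model (sketch only, not part of the build;
	   the function name is made up):

	     #include <stdint.h>

	     // Zero iff the two words are equal and d1 contains no NUL byte.
	     static uint32_t syndrome_model (uint32_t d1, uint32_t d2)
	     {
	       uint32_t diff = d1 ^ d2;	// non-zero in every byte that differs
	       uint32_t synd = 0;
	       for (int i = 0; i < 4; i++)
	         {
	           uint32_t b1 = (d1 >> (8 * i)) & 0xff;
	           // UADD8 d1, 0xffffffff carries out of a byte lane iff that
	           // byte of d1 is non-zero; SEL then keeps the XOR byte for
	           // such lanes and forces 0xff where d1 holds a NUL byte.
	           uint32_t lane = (b1 != 0) ? ((diff >> (8 * i)) & 0xff) : 0xff;
	           synd |= lane << (8 * i);
	         }
	       return synd;	// non-zero => first difference or end of string is here
	     }
	*/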

	/* Build Options:
	   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
	   byte in each string.  If comparing completely random strings
	   the pre-check will save time, since there is a very high
	   probability of a mismatch in the first character: we save
	   significant overhead if this is the common case.  However,
	   if strings are likely to be identical (e.g. because we're
	   verifying a hit in a hash table), then this check is largely
	   redundant.  */
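	/* With the pre-check enabled, the first-byte test at the top of
	   strcmp behaves roughly like this C fragment (illustrative sketch
	   only, not part of the build; src1/src2 stand for the two string
	   arguments):

	     unsigned char c1 = *(const unsigned char *) src1;
	     unsigned char c2 = *(const unsigned char *) src2;
	     if (c1 == 0 || c1 != c2)	// result already decided by byte 0
	       return c1 - c2;
	     // ... otherwise fall through to the full comparison.
	*/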

	/* This version uses Thumb-2 code.  */
	.thumb
	.syntax unified

#include "arm_asm.h"

/* Endianness-dependent shift directions, byte masks and byte offsets
   used throughout this file.  */
#ifdef __ARM_BIG_ENDIAN
#define S2LO		lsl
#define S2LOEQ		lsleq
#define S2HI		lsr
#define MSB		0x000000ff
#define LSB		0xff000000
#define BYTE0_OFFSET	24
#define BYTE1_OFFSET	16
#define BYTE2_OFFSET	8
#define BYTE3_OFFSET	0
#else /* not  __ARM_BIG_ENDIAN */
#define S2LO		lsr
#define S2LOEQ		lsreq
#define S2HI		lsl
#define MSB		0xff000000
#define LSB		0x000000ff
#define BYTE0_OFFSET	0
#define BYTE1_OFFSET	8
#define BYTE2_OFFSET	16
#define BYTE3_OFFSET	24
#endif /* not  __ARM_BIG_ENDIAN */

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2


	/* Macro to compute and return the result value for word-aligned
	   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF
#else
	/* To use the big-endian trick we'd have to reverse all three
	   words; that's slower than this approach.  */
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1

	epilogue push_ip=HAVE_PAC_LEAF
#endif
	.endm
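
	/* As an illustrative C model of the little-endian branch of the
	   macro above (sketch only, not part of the build; the function
	   name is made up and GCC-style builtins stand in for REV and CLZ):
	   the non-zero syndrome is byte-reversed so that CLZ locates the
	   lowest-addressed marked byte, and the corresponding bytes of the
	   two data words are subtracted.

	     #include <stdint.h>

	     static int epilogue_model (uint32_t synd, uint32_t d1, uint32_t d2)
	     {
	       // Precondition: synd != 0 (guaranteed at every use of the macro).
	       // bswap32 + clz model REV + CLZ; the BIC #7 rounds down to a byte lane.
	       unsigned bit = ((unsigned) __builtin_clz (__builtin_bswap32 (synd))) & ~7u;
	       int c1 = (d1 >> bit) & 0xff;	// first differing or NUL byte of d1
	       int c2 = (d2 >> bit) & 0xff;	// the byte of d2 at the same offset
	       return c1 - c2;
	     }
	*/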

	.text
	.p2align	5
def_fn	strcmp
	.fnstart
	.cfi_sections .debug_frame
	.cfi_startproc
	prologue push_ip=HAVE_PAC_LEAF
#ifndef STRCMP_NO_PRECHECK
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	.Lfastpath_exit
#endif
	strd	r4, r5, [sp, #-16]!
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_rel_offset 6, 8
	.cfi_rel_offset 7, 12
	mvn	const_m1, #0
	lsl	r2, tmp1, #29
	cbz	r2, .Lloop_aligned8

.Lnot_aligned:
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	.Lmisaligned8

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
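	/* Illustrative C model of the masking (sketch only, not part of the
	   build; little-endian shown, S2HI/S2LO swap direction on
	   big-endian): bytes loaded from before the real start of the
	   strings are forced to 0xff so they can neither produce a spurious
	   difference nor look like a NUL terminator.

	     // off = src1 & 7 is the mutual misalignment (src2 & 7 is equal).
	     uint32_t mask = ~(~0u << (8 * (off & 3)));	// low (off & 3) bytes set
	     if (off < 4)
	       { data1a |= mask;  data2a |= mask; }
	     else	// the whole low word precedes the strings
	       { data1a = data2a = ~0u;  data1b |= mask;  data2b |= mask; }

	   In the code this OR-with-mask is expressed as S2HI + ORN, for the
	   reason given in the comment below.  */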
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	.Lstart_realigned8
	orn	data1b, data1b, tmp1
	mov	data1a, const_m1
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	.Lstart_realigned8

	/* Unwind the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
.Lloop_aligned8:
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
.Lstart_realigned8:
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, .Ldiff_in_a
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, .Ldiff_in_b

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
	beq	.Lloop_aligned8

.Ldiff_found:
	cbnz	syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	.cfi_restore_state
.Lmisaligned8:
	tst	tmp1, #3
	bne	.Lmisaligned4
	ands	tmp1, src1, #3
	bne	.Lmutual_align4

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
.Lloop_aligned4:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned4:
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, .Laligned4_done
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	.Lloop_aligned4

.Laligned4_done:
	strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	.Lstart_realigned4

.Lmisaligned4:
	ands	tmp1, src1, #3
	beq	.Lsrc1_aligned
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	.Laligned_m2
	bcs	.Laligned_m1

#ifdef STRCMP_NO_PRECHECK
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m1:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	add	src2, src2, #4
	cbnz	data2, .Lsrc1_aligned
#else  /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbnz	data2, .Laligned_m1
#endif

.Lmisaligned_exit:
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	.cfi_restore 4
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF

#ifndef STRCMP_NO_PRECHECK
.Lfastpath_exit:
	.cfi_restore_state
	.cfi_remember_state
	sub	r0, r2, r3
	epilogue push_ip=HAVE_PAC_LEAF

.Laligned_m1:
	.cfi_restore_state
	.cfi_remember_state
	add	src2, src2, #4
#endif
.Lsrc1_aligned:
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
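	/* The three .LoverlapN loops below each handle one value of src2 & 3
	   (N = 4 - (src2 & 3) is the number of useful bytes left in the
	   current aligned src2 word).  As an illustrative C model only
	   (sketch, not part of the build; little-endian shown, the shifts
	   run the other way on big-endian): each aligned word of src1 is
	   compared against a value stitched together from two consecutive
	   aligned words of src2.

	     // ofs = src2 & 3 (1, 2 or 3); wk, wk1 are consecutive aligned
	     // 32-bit words of src2.
	     uint32_t stitched = (wk >> (8 * ofs)) | (wk1 << (8 * (4 - ofs)));

	   The loops never materialise "stitched": the low part is checked
	   with EOR against data2, S2LO #(8*ofs), and the remaining bytes
	   are checked against the next word with CMP against data2,
	   S2HI #(8*(4-ofs)), so every word of src2 is still loaded only
	   once.  */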
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap3
4:
	S2LO	data2, data2, #8
	b	.Lstrcmp_tail

5:
	bics	syndrome, syndrome, #MSB
	bne	.Lstrcmp_done_equal

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	neg	result, result
	epilogue push_ip=HAVE_PAC_LEAF

6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap2
4:
	S2LO	data2, data2, #16
	b	.Lstrcmp_tail
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	.Lstrcmp_done_equal

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	.Lstrcmp_tail

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap1
4:
	S2LO	data2, data2, #24
	b	.Lstrcmp_tail
5:
	tst	syndrome, #LSB
	bne	.Lstrcmp_done_equal
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	.Lstrcmp_tail

.Lstrcmp_done_equal:
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF

.Lstrcmp_tail:
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	sub	result, result, data2, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF
	.cfi_endproc
	.cantunwind
	.fnend
	.size strcmp, . - strcmp