/*
 * Copyright (c) 2012-2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

	/* Implementation of strcmp for ARMv6.  Use ldrd to support wider
	   loads, provided the data is sufficiently aligned.  Use
	   saturating arithmetic to optimize the compares.  */
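
	/* A rough C sketch of the per-word test used throughout this file
	   (illustrative only; the helper name is not part of this file).
	   UADD8 against 0xffffffff sets the per-byte GE flags for every
	   non-zero byte of data1, and SEL then builds a "syndrome" word
	   that becomes non-zero as soon as data1 holds a NUL byte or the
	   two words differ:

	       unsigned syndrome (unsigned w1, unsigned w2)
	       {
	         unsigned s = 0;
	         for (int i = 0; i < 4; i++)
	           {
	             unsigned b1 = (w1 >> (8 * i)) & 0xff;
	             unsigned b2 = (w2 >> (8 * i)) & 0xff;
	             if (b1 == 0)
	               s |= 0xffu << (8 * i);       // NUL terminator
	             else
	               s |= (b1 ^ b2) << (8 * i);   // zero if the bytes match
	           }
	         return s;                          // non-zero => stop
	       }  */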

	/* Build Options:
	   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
	   byte in the string.  If comparing completely random strings
	   the pre-check will save time, since there is a very high
	   probability of a mismatch in the first character: we save
	   significant overhead if this is the common case.  However,
	   if strings are likely to be identical (eg because we're
	   verifying a hit in a hash table), then this check is largely
	   redundant.  */
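
	/* STRCMP_NO_PRECHECK is not set anywhere in this file; if the
	   pre-check is unwanted it is expected to be defined by the
	   including build, for example on the assembler command line
	   with -DSTRCMP_NO_PRECHECK (exact integration depends on the
	   port).  */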

	.arm

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2


	/* Macro to compute and return the result value for word-aligned
	   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1, lsr #24
	bx	lr
#else
	/* To use the big-endian trick we'd have to reverse all three words;
	   that's slower than this approach.  */
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1

	bx	lr
#endif
	.endm
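
	/* Roughly, the macro above computes the following (illustrative C
	   only; the helper name is not part of this file, stack/CFI handling
	   is omitted, and the byte order shown is for a little-endian word
	   where byte 0 is the first character of the string):

	       int epilogue (unsigned synd, unsigned d1, unsigned d2)
	       {
	         for (int i = 0; i < 4; i++)        // bytes in string order
	           {
	             if ((synd >> (8 * i)) & 0xff)  // first NUL or mismatch
	               return ((d1 >> (8 * i)) & 0xff)
	                      - ((d2 >> (8 * i)) & 0xff);
	           }
	         return 0;                          // not reached: synd != 0
	       }

	   The assembly gets there without a loop: CLZ on the (byte-reversed,
	   for little-endian) syndrome locates the first marked byte and a
	   pair of shifts then extracts the relevant byte from d1 and d2.  */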

	.text
	.p2align	5
.Lstrcmp_start_addr:
#ifndef STRCMP_NO_PRECHECK
.Lfastpath_exit:
	sub	r0, r2, r3
	bx	lr
#endif
def_fn	strcmp
#ifndef STRCMP_NO_PRECHECK
	ldrb	r2, [src1]
	ldrb	r3, [src2]
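	/* r2 != 0 && r2 == r3, evaluated in the flags: if r2 is zero the
	   first compare clears C, so the second compare is skipped and Z
	   stays clear; either a NUL in src1 or a byte mismatch therefore
	   takes the branch to the fast exit.  */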
	cmp	r2, #1
	cmpcs	r2, r3
	bne	.Lfastpath_exit
#endif
	.cfi_sections .debug_frame
	.cfi_startproc
	strd	r4, r5, [sp, #-16]!
	.cfi_def_cfa_offset 16
	.cfi_offset 4, -16
	.cfi_offset 5, -12
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_offset 6, -8
	.cfi_offset 7, -4
	mvn	const_m1, #0
	tst	tmp1, #7
	beq	.Lloop_aligned8

.Lnot_aligned:
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	.Lmisaligned8

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
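	/* For example (little-endian), strings starting two bytes into their
	   doublewords give tmp2 = 16, so the mask computed below is
	   ~(0xffffffff << 16) = 0x0000ffff; ORRing it into data1a and data2a
	   forces the two bytes that precede the strings to 0xff, so they can
	   neither miscompare nor look like a NUL terminator.  */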
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In ARM code we can't use ORN, but we do have MVN with a
	   register shift.  */
	mvn	tmp1, const_m1, S2HI tmp2
	orr	data1a, data1a, tmp1
	orr	data2a, data2a, tmp1
	beq	.Lstart_realigned8
	orr	data1b, data1b, tmp1
	mov	data1a, const_m1
	orr	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	.Lstart_realigned8

	/* Unwind the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
.Lloop_aligned8:
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
.Lstart_realigned8:
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
	bne	.Ldiff_found

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
	beq	.Lloop_aligned8

.Ldiff_found:
	cmp	syndrome_a, #0
	bne	.Ldiff_in_a

.Ldiff_in_b:
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	.cfi_restore_state
.Lmisaligned8:
	tst	tmp1, #3
	bne	.Lmisaligned4
	ands	tmp1, src1, #3
	bne	.Lmutual_align4

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
.Lloop_aligned4:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned4:
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	bne	.Laligned4_done

	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	.Lloop_aligned4

.Laligned4_done:
	strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In ARM code we can't use ORN, but we do have MVN with a
	   register shift.  */
	mvn	tmp1, const_m1, S2HI tmp1
	orr	data1, data1, tmp1
	orr	data2, data2, tmp1
	b	.Lstart_realigned4

.Lmisaligned4:
	ands	tmp1, src1, #3
	beq	.Lsrc1_aligned
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	.Laligned_m2
	bcs	.Laligned_m1

#ifdef STRCMP_NO_PRECHECK
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	cmp	tmp1, #1
	cmpcs	tmp1, data2
	bne	.Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	cmp	tmp1, #1
	cmpcs	tmp1, data2
	bne	.Lmisaligned_exit

.Laligned_m1:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	cmp	tmp1, #1
	cmpcs	tmp1, data2
	beq	.Lsrc1_aligned

#else  /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	cmp	tmp1, #1
	cmpcs	tmp1, data2
	bne	.Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	cmp	tmp1, #1
	cmpcs	tmp1, data2
	beq	.Laligned_m1
#endif

.Lmisaligned_exit:
	.cfi_remember_state
	sub	result, tmp1, data2
	ldr	r4, [sp], #16
	.cfi_restore 4
	bx	lr

#ifndef STRCMP_NO_PRECHECK
.Laligned_m1:
	add	src2, src2, #4
#endif
.Lsrc1_aligned:
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
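	/* Each .LoverlapN loop below compares one aligned word of src1
	   against a window assembled from two consecutive aligned words of
	   src2: the first N bytes of the src1 word are checked against the
	   tail of the src2 word already loaded, and the remaining 4 - N
	   bytes against the start of the next src2 word, so only one new
	   word load per string is needed per iteration.  */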
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cmp	syndrome, #0
	ldreq	data2, [src2], #4
	bne	5f

	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap3
4:
	S2LO	data2, data2, #8
	b	.Lstrcmp_tail

5:
	bics	syndrome, syndrome, #MSB
	bne	.Lstrcmp_done_equal

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	neg	result, result
	bx	lr

6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cmp	syndrome, #0
	ldreq	data2, [src2], #4
	bne	5f
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap2
4:
	S2LO	data2, data2, #16
	b	.Lstrcmp_tail
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	.Lstrcmp_done_equal

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	.Lstrcmp_tail

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cmp	syndrome, #0
	ldreq	data2, [src2], #4
	bne	5f
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap1
4:
	S2LO	data2, data2, #24
	b	.Lstrcmp_tail
5:
	tst	syndrome, #LSB
	bne	.Lstrcmp_done_equal
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	.Lstrcmp_tail

.Lstrcmp_done_equal:
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	bx	lr

.Lstrcmp_tail:
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	sub	result, result, data2, lsr #24
	bx	lr
	.cfi_endproc
	.size strcmp, . - .Lstrcmp_start_addr