/* Copyright (c) 2010-2011, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   Written by Dave Gilbert <david.gilbert@linaro.org>

   This memchr routine is optimised on a Cortex-A9 and should work on
   all ARMv7 processors.  It has a fast path for short sizes, and has
   an optimised path for large data sets; the worst case is finding the
   match early in a large data set. */

/* Copyright (c) 2015 ARM Ltd.
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
	 notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
	 notice, this list of conditions and the following disclaimer in the
	 documentation and/or other materials provided with the distribution.
       * Neither the name of Linaro nor the
	 names of its contributors may be used to endorse or promote products
	 derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */

@ 2011-02-07 david.gilbert@linaro.org
@    Extracted from local git a5b438d861
@ 2011-07-14 david.gilbert@linaro.org
@    Import endianness fix from local git ea786f1b
@ 2011-10-11 david.gilbert@linaro.org
@    Import from cortex-strings bzr rev 63
@    Flip to ldrd (as suggested by Greta Yorsh)
@    Make conditional on CPU type
@    tidy

@ This code requires armv6t2 or later.  Uses Thumb2.

	.syntax unified

#include "arm_asm.h"

@ NOTE: This ifdef MUST match the one in memchr-stub.c
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
#if __ARM_ARCH >= 8 && __ARM_ARCH_PROFILE == 'R'
	.arch	armv8-r
#else
	.arch	armv7-a
#endif
	.fpu	neon


/* Arguments */
#define srcin		r0
#define chrin		r1
#define cntin		r2
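/* i.e. the arguments of the C prototype void *memchr (const void *s, int c, size_t n).  */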

/* Retval */
#define result		r0	/* Live range does not overlap with srcin */

/* Working registers */
#define src		r1	/* Live range does not overlap with chrin */
#define tmp		r3
#define synd		r0	/* No overlap with srcin or result */
#define soff		r12

/* Working NEON registers */
#define vrepchr		q0
#define vdata0		q1
#define vdata0_0	d2	/* Lower half of vdata0 */
#define vdata0_1	d3	/* Upper half of vdata0 */
#define vdata1		q2
#define vdata1_0	d4	/* Lower half of vdata1 */
#define vdata1_1	d5	/* Upper half of vdata1 */
#define vrepmask	q3
#define vrepmask0	d6
#define vrepmask1	d7
#define vend		q4
#define vend0		d8
#define vend1		d9

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
 * byte. Each bit is set if the relevant byte matched the requested character
 * and cleared otherwise. Since the bits in the syndrome reflect exactly the
 * order in which things occur in the original string, counting trailing zeros
 * allows us to identify exactly which byte has matched.
 */
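
/*
 * A rough scalar C sketch of the same computation (illustrative only;
 * the helper below is ours and is not part of this file):
 *
 *   #include <stdint.h>
 *
 *   uint32_t syndrome (const uint8_t *chunk, uint8_t c)
 *   {
 *     uint32_t synd = 0;
 *     for (int i = 0; i < 32; i++)
 *       if (chunk[i] == c)
 *         synd |= 1u << i;	// bit i <=> byte i matched
 *     return synd;		// ctz (synd) = offset of first match
 *   }
 */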

	.text
	.thumb_func
	.align 4
	.p2align 4,,15
	.global memchr
	.type memchr,%function

memchr:
	.cfi_sections .debug_frame
	.cfi_startproc
	/* Use a simple loop if there are less than 8 bytes to search.  */
	cmp	cntin, #7
	bhi	.Llargestr
	and	chrin, chrin, #0xff

.Lsmallstr:
	subs	cntin, cntin, #1
	blo	.Lnotfound	/* Return not found if reached end.  */
	ldrb	tmp, [srcin], #1
	cmp	tmp, chrin
	bne	.Lsmallstr	/* Loop again if not found.  */
	/* Otherwise fixup address and return.  */
	sub	result, result, #1
	bx	lr


.Llargestr:
	vdup.8	vrepchr, chrin	/* Duplicate char across all lanes. */
	/*
	 * Magic constant 0x8040201008040201 allows us to identify which lane
	 * matches the requested byte.
	 */
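	/*
	 * Each byte of vrepmask holds a distinct single bit (0x01, 0x02,
	 * ... 0x80, repeating per 64-bit half).  After vceq (lanes become
	 * 0x00/0xff) and vand, a matching lane keeps exactly its own bit,
	 * so the pairwise adds used later act as carry-free ORs.
	 */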
	movw	tmp, #0x0201
	movt	tmp, #0x0804
	lsl	soff, tmp, #4
	vmov	vrepmask0, tmp, soff
	vmov	vrepmask1, tmp, soff
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	ands	soff, srcin, #31
	beq	.Lloopintro	/* Go straight to main loop if it's aligned. */

	/*
	 * Input string is not 32-byte aligned.  We calculate the syndrome
	 * value for the aligned 32-byte block containing the first bytes,
	 * and mask off the irrelevant part.
	 */
	vld1.8		{vdata0, vdata1}, [src:256]!
	sub		tmp, soff, #32
	adds		cntin, cntin, tmp
	vceq.i8		vdata0, vdata0, vrepchr
	vceq.i8		vdata1, vdata1, vrepchr
	vand		vdata0, vdata0, vrepmask
	vand		vdata1, vdata1, vrepmask
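	/*
	 * Fold the 32 comparison bytes down to one bit each: at every
	 * vpadd step the bytes being summed hold disjoint mask bits, so
	 * the adds behave as ORs, and lane 0 of vdata0_0 ends up holding
	 * the full 32-bit syndrome.
	 */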
	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
	vmov		synd, vdata0_0[0]

	/* Clear the soff lower bits, i.e. any matches that fall before the
	   start of the string.  */
	lsr		synd, synd, soff
	lsl		synd, synd, soff
	/* The first block can also be the last (flags still set by the
	   adds above).  */
	bls		.Lmasklast
	/* Have we found something already? */
	cbnz		synd, .Ltail


.Lloopintro:
	vpush	{vend}
	/* 264/265 correspond to d8/d9 for q4 */
	.cfi_adjust_cfa_offset	16
	.cfi_rel_offset	264, 0
	.cfi_rel_offset	265, 8
	.p2align 3,,7
.Lloop:
	vld1.8		{vdata0, vdata1}, [src:256]!
	subs		cntin, cntin, #32
	vceq.i8		vdata0, vdata0, vrepchr
	vceq.i8		vdata1, vdata1, vrepchr
	/* If we're out of data we finish regardless of the result. */
	bls		.Lend
	/* Use a fast check for the termination condition. */
	vorr		vend, vdata0, vdata1
	vorr		vend0, vend0, vend1
	vmov		synd, tmp, vend0
	orrs		synd, synd, tmp
	/* We're not out of data, loop if we haven't found the character. */
	beq		.Lloop

.Lend:
	vpop		{vend}
	.cfi_adjust_cfa_offset	-16
	.cfi_restore	264
	.cfi_restore	265

	/* Termination condition found, let's calculate the syndrome value. */
	vand		vdata0, vdata0, vrepmask
	vand		vdata1, vdata1, vrepmask
	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
	vmov		synd, vdata0_0[0]
	cbz		synd, .Lnotfound
	bhi		.Ltail		/* Uses the condition flags from the
					   subs above: hi means there is data
					   left, so this is not the last
					   block.  */


.Lmasklast:
	/* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */
	neg	cntin, cntin
	lsl	synd, synd, cntin
	lsrs	synd, synd, cntin
	it	eq
	moveq	src, #0	/* If no match, set src to 0 so the retval is 0. */


.Ltail:
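	/* For example, synd = 0x00001000 (byte 12 of the chunk matched)
	   gives clz (rbit (synd)) = 12, so we return the chunk base + 12.  */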
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate the last post-increment */
	sub	src, src, #32
	/* Count the leading zeros */
	clz	synd, synd
	/* Compute the potential result and return */
	add	result, src, synd
	bx	lr


.Lnotfound:
	/* Set result to NULL if not found and return */
	mov	result, #0
	bx	lr

	.cfi_endproc
	.size	memchr, . - memchr

#elif __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)

#if __ARM_ARCH_PROFILE == 'M'
#if __ARM_ARCH >= 8
	/* keep config inherited from -march=.  */
#else
	.arch armv7e-m
#endif /* __ARM_ARCH >= 8 */
#else
	.arch armv6t2
#endif /* __ARM_ARCH_PROFILE == 'M' */

@ this lets us check a flag in a 00/ff byte easily in either endianness
#ifdef __ARMEB__
#define CHARTSTMASK(c) 1<<(31-(c*8))
#else
#define CHARTSTMASK(c) 1<<(c*8)
#endif
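@ e.g. on a little-endian target CHARTSTMASK(2) expands to 1<<16, the low
@ bit of the flag byte for character 2 of the word; since each flag byte
@ is either 00 or ff, testing that single bit is enough.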
	.text
	.thumb

@ ---------------------------------------------------------------------------
	.thumb_func
	.align 2
	.p2align 4,,15
	.global memchr
	.type memchr,%function
	.fnstart
	.cfi_startproc
memchr:
	@ r0 = start of memory to scan
	@ r1 = character to look for
	@ r2 = length
	@ returns r0 = pointer to character or NULL if not found
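	@ i.e. the C signature void *memchr (const void *s, int c, size_t n)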
	prologue
	and	r1,r1,#0xff	@ Don't trust the caller to pass a char

	cmp	r2,#16		@ If short don't bother with anything clever
	blt	20f

	tst	r0, #7		@ If it's already aligned skip the next bit
	beq	10f

	@ Work up to an aligned point
5:
	ldrb	r3, [r0],#1
	subs	r2, r2, #1
	cmp	r3, r1
	beq	50f		@ If it matches exit found
	tst	r0, #7
	cbz	r2, 40f		@ If we run off the end, exit not found
	bne	5b		@ If not aligned yet then do next byte

10:
	@ We are aligned, we know we have at least 8 bytes to work with
	push	{r4,r5,r6,r7}
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	.cfi_rel_offset 6, 8
	.cfi_rel_offset 7, 12
	orr	r1, r1, r1, lsl #8	@ Expand the match character across
	orr	r1, r1, r1, lsl #16	@ all four bytes of the word
	bic	r4, r2, #7	@ Number of doublewords to work with, times 8
	mvns	r7, #0		@ all F's
	movs	r3, #0

15:
	ldrd    r5,r6,[r0],#8
	subs	r4, r4, #8
	eor	r5,r5, r1	@ r5,r6 have 00's where bytes match the target
	eor	r6,r6, r1
	uadd8	r5, r5, r7	@ Parallel add 0xff - sets GE bits for bytes != 0
	sel	r5, r3, r7	@ bytes are 00 for non-00 bytes,
				@ or ff for 00 bytes - NOTE INVERSION
	uadd8	r6, r6, r7	@ Parallel add 0xff - sets GE bits for bytes != 0
	sel	r6, r5, r7	@ chained... bytes are 00 for non-00 bytes,
				@ or ff for 00 bytes - NOTE INVERSION
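	@ Worked example (little-endian, searching for 'A'): if the loaded
	@ word was "xAyz" then after the eor only byte 1 is 00; uadd8 of
	@ 0xff carries out of every byte except that one, so its GE bit is
	@ left clear and sel writes ff into byte 1 and 00 elsewhere.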
	cbnz	r6, 60f
	bne	15b		@ (Flags from the subs above)

	pop	{r4,r5,r6,r7}
	.cfi_restore 7
	.cfi_restore 6
	.cfi_restore 5
	.cfi_restore 4
	.cfi_adjust_cfa_offset -16
	and	r1,r1,#0xff	@ r1 back to a single character
	and	r2,r2,#7	@ Count of bytes remaining after the
				@ doublewords have been done

20:
	cbz	r2, 40f		@ Zero length, or we already hit the end: not found

21:  @ Post-alignment tail, or just a short call
	ldrb	r3,[r0],#1
	subs	r2,r2,#1
	eor	r3,r3,r1	@ r3 = 0 if match - doesn't clobber the flags from the subs
	cbz	r3, 50f
	bne	21b		@ on r2 flags

40:
	.cfi_remember_state
	movs	r0,#0		@ not found
	epilogue

50:
	.cfi_restore_state
	.cfi_remember_state
	subs	r0,r0,#1	@ found
	epilogue

60:  @ We're here because the fast path found a hit,
     @ now we have to track down exactly which word it was in
	@ r0 points to the start of the double word after the one tested
	@ r5 has the 00/ff pattern for the first word, r6 has the chained value
	@ This point is reached from the cbnz midway through label 15, prior to
	@ popping r4-r7 off the stack.  .cfi_restore_state alone disregards
	@ this, so we manually correct it.
	.cfi_restore_state	@ Standard post-prologue state
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	.cfi_rel_offset 6, 8
	.cfi_rel_offset 7, 12
	cmp	r5, #0
	itte	eq
	moveq	r5, r6		@ the match is in the 2nd word
	subeq	r0,r0,#3	@ Points to 2nd byte of 2nd word
	subne	r0,r0,#7	@ or 2nd byte of 1st word
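	@ e.g. a hit in byte 5 of the doubleword: r5 is zero (no hit in the
	@ first word), so we take r6 and point r0 at the 2nd byte of the
	@ 2nd word; the tests below then advance r0 to the matching byte.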

	@ r0 currently points to the 2nd byte of the word containing the hit
	tst	r5, # CHARTSTMASK(0)	@ 1st character
	bne	61f
	adds	r0,r0,#1
	tst	r5, # CHARTSTMASK(1)	@ 2nd character
	ittt	eq
	addeq	r0,r0,#1
	tsteq	r5, # (3<<15)		@ 2nd & 3rd character
	@ If not the 3rd must be the last one
	addeq	r0,r0,#1

61:
	pop	{r4,r5,r6,r7}
	.cfi_restore 7
	.cfi_restore 6
	.cfi_restore 5
	.cfi_restore 4
	.cfi_adjust_cfa_offset -16
	subs	r0,r0,#1
	epilogue
	.cfi_endproc
	.cantunwind
	.fnend
#else
  /* Defined in memchr.c.  */
#endif