/* Copyright (c) 2010-2011, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   Written by Dave Gilbert <david.gilbert@linaro.org>
   This memchr routine is optimised for the Cortex-A9 and should work on
   all ARMv7 processors.  It has a fast path for short sizes and an
   optimised path for large data sets; the worst case is finding the
   match early in a large data set.  */

/* Copyright (c) 2015 ARM Ltd.
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
	 notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
	 notice, this list of conditions and the following disclaimer in the
	 documentation and/or other materials provided with the distribution.
       * Neither the name of Linaro nor the
	 names of its contributors may be used to endorse or promote products
	 derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */

// 2011-02-07 david.gilbert@linaro.org
//    Extracted from local git a5b438d861
// 2011-07-14 david.gilbert@linaro.org
//    Import endianness fix from local git ea786f1b
// 2011-10-11 david.gilbert@linaro.org
//    Import from cortex-strings bzr rev 63
//    Flip to ldrd (as suggested by Greta Yorsh)
//    Make conditional on CPU type
//    tidy

// This code requires armv6t2 or later.  Uses Thumb2.

#include <picolibc.h>

	.syntax unified

#include "arm_asm.h"

// NOTE: This ifdef MUST match the one in memchr-stub.c
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
#if __ARM_ARCH >= 8 && __ARM_ARCH_PROFILE == 'R'
	.arch	armv8-r
#else
	.arch	armv7-a
#endif
	.fpu	neon


/* Arguments */
#define srcin		r0
#define chrin		r1
#define cntin		r2

/* Retval */
#define result		r0	/* Live range does not overlap with srcin */

/* Working registers */
#define src		r1	/* Live range does not overlap with chrin */
#define tmp		r3
#define synd		r0	/* No overlap with srcin or result */
#define soff		r12

/* Working NEON registers */
#define vrepchr		q0
#define vdata0		q1
#define vdata0_0	d2	/* Lower half of vdata0 */
#define vdata0_1	d3	/* Upper half of vdata0 */
#define vdata1		q2
#define vdata1_0	d4	/* Lower half of vdata1 */
#define vdata1_1	d5	/* Upper half of vdata1 */
#define vrepmask	q3
#define vrepmask0	d6
#define vrepmask1	d7
#define vend		q4
#define vend0		d8
#define vend1		d9

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
 * byte.  Each bit is set if the relevant byte matched the requested character
 * and cleared otherwise.  Since the bits in the syndrome reflect exactly the
 * order in which things occur in the original string, counting trailing zeros
 * identifies exactly which byte has matched.
 */

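/*
 * For illustration only: a rough C equivalent of the per-chunk syndrome
 * computation (little-endian view; the function name is ours, not part
 * of this file):
 *
 *	uint32_t syndrome32(const unsigned char *chunk, unsigned char c)
 *	{
 *	    uint32_t synd = 0;
 *	    for (int i = 0; i < 32; i++)
 *		if (chunk[i] == c)
 *		    synd |= (uint32_t)1 << i;	// bit i <=> byte i matched
 *	    return synd;
 *	}
 *
 * The NEON code below builds the same value with vceq (per-byte compare),
 * vand with the repeated 0x8040201008040201 mask (each byte keeps only
 * the bit for its own lane), and a chain of vpadd pairwise adds that
 * merge the lane bits, carry-free, into a single 32-bit word.
 */
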
	.text
	.thumb_func
	.align 4
	.p2align 4,,15
	.global memchr
	.type memchr,%function

memchr:
	.cfi_sections .debug_frame
	.cfi_startproc
	/* Use a simple loop if there are fewer than 8 bytes to search.  */
	cmp	cntin, #7
	bhi	.Llargestr
	and	chrin, chrin, #0xff

.Lsmallstr:
	subs	cntin, cntin, #1
	blo	.Lnotfound	/* Return not found if we reached the end.  */
	ldrb	tmp, [srcin], #1
	cmp	tmp, chrin
	bne	.Lsmallstr	/* Loop again if not found.  */
	/* Otherwise fix up the address and return.  */
	sub	result, result, #1
	bx	lr


.Llargestr:
	vdup.8	vrepchr, chrin	/* Duplicate the char across all lanes.  */
	/*
	 * The magic constant 0x8040201008040201 allows us to identify which
	 * lane matched the requested byte.
	 */
	movw	tmp, #0x0201
	movt	tmp, #0x0804		/* tmp = 0x08040201 */
	lsl	soff, tmp, #4		/* soff = 0x80402010 */
	vmov	vrepmask0, tmp, soff	/* d6 = 0x8040201008040201 */
	vmov	vrepmask1, tmp, soff
	/* Work with aligned 32-byte chunks.  */
	bic	src, srcin, #31
	ands	soff, srcin, #31
	beq	.Lloopintro	/* Go straight to the main loop if aligned.  */

	/*
	 * The input string is not 32-byte aligned.  We calculate the
	 * syndrome value for the aligned 32-byte block containing the first
	 * bytes and mask off the irrelevant part.
	 */
	vld1.8		{vdata0, vdata1}, [src:256]!
	sub		tmp, soff, #32
	adds		cntin, cntin, tmp	/* cntin += soff - 32 */
	vceq.i8		vdata0, vdata0, vrepchr
	vceq.i8		vdata1, vdata1, vrepchr
	vand		vdata0, vdata0, vrepmask
	vand		vdata1, vdata1, vrepmask
	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
	vmov		synd, vdata0_0[0]

	/* Clear the soff lower bits.  */
	lsr		synd, synd, soff
	lsl		synd, synd, soff
	/* The first block can also be the last (flags from the adds above).  */
	bls		.Lmasklast
	/* Have we found something already?  */
	cbnz		synd, .Ltail

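	/*
	 * Worked example: for srcin = base + 5, soff = 5, so the lsr/lsl
	 * pair above discards any match bits for bytes base..base+4, which
	 * lie before the real start of the buffer.
	 */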

.Lloopintro:
	vpush	{vend}
	/* 264/265 correspond to d8/d9 for q4.  */
	.cfi_adjust_cfa_offset	16
	.cfi_rel_offset	264, 0
	.cfi_rel_offset	265, 8
	.p2align 3,,7
.Lloop:
	vld1.8		{vdata0, vdata1}, [src:256]!
	subs		cntin, cntin, #32
	vceq.i8		vdata0, vdata0, vrepchr
	vceq.i8		vdata1, vdata1, vrepchr
	/* If we're out of data we finish regardless of the result.  */
	bls		.Lend
	/* Use a fast check for the termination condition.  */
	vorr		vend, vdata0, vdata1
	vorr		vend0, vend0, vend1
	vmov		synd, tmp, vend0
	orrs		synd, synd, tmp	/* Sets Z; C from the subs survives.  */
	/* We're not out of data; loop if we haven't found the character.  */
	beq		.Lloop

.Lend:
	vpop		{vend}
	.cfi_adjust_cfa_offset	-16
	.cfi_restore	264
	.cfi_restore	265

	/* Termination condition found; calculate the syndrome value.  */
	vand		vdata0, vdata0, vrepmask
	vand		vdata1, vdata1, vrepmask
	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
	vmov		synd, vdata0_0[0]
	cbz		synd, .Lnotfound
	bhi		.Ltail	/* More data was left, so the match is in bounds.  */


.Lmasklast:
	/* Clear the (-cntin) upper bits to avoid out-of-bounds matches.  */
	neg	cntin, cntin
	lsl	synd, synd, cntin
	lsrs	synd, synd, cntin
	it	eq
	moveq	src, #0	/* If no match, set src to 0 so the retval is 0.  */


.Ltail:
	/* Count the trailing zeros using bit reversing.  */
	rbit	synd, synd
	/* Compensate for the last post-increment.  */
	sub	src, src, #32
	/* Count the leading zeros.  */
	clz	synd, synd
	/* Compute the potential result and return.  */
	add	result, src, synd
	bx	lr
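	/*
	 * Worked example: a match at byte 5 of the chunk sets bit 5 of synd;
	 * rbit followed by clz counts the trailing zeros (ARM has no single
	 * ctz instruction), yielding 5, and result = (src - 32) + 5.
	 */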


.Lnotfound:
	/* Set result to NULL if not found and return.  */
	mov	result, #0
	bx	lr

	.cfi_endproc
	.size	memchr, . - memchr

#elif __ARM_ARCH_ISA_THUMB >= 2 && defined (__ARM_FEATURE_DSP)

#if __ARM_ARCH_PROFILE == 'M'
#if __ARM_ARCH >= 8
	/* Keep the configuration inherited from -march=.  */
#else
	.arch armv7e-m
#endif /* __ARM_ARCH >= 8 */
#else
	.arch armv6t2
#endif /* __ARM_ARCH_PROFILE == 'M' */

// This lets us check a flag in a 00/ff byte easily in either endianness.
#ifdef __ARMEB__
#define CHARTSTMASK(c) (1<<(31-(c*8)))
#else
#define CHARTSTMASK(c) (1<<(c*8))
#endif
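
// For illustration, a rough C model of the 00/ff byte mask that the
// UADD8/SEL pair computes below (little-endian view; the function name
// is ours, not part of this file):
//
//	uint32_t mask_of_zero_bytes(uint32_t w)
//	{
//	    uint32_t m = 0;
//	    for (int i = 0; i < 4; i++)
//		if (((w >> (i * 8)) & 0xff) == 0)
//		    m |= (uint32_t)0xff << (i * 8);	// ff where the byte was 00
//	    return m;
//	}
//
// After XORing a loaded word with the search character replicated to all
// four bytes, matching bytes become 00, so CHARTSTMASK can test the
// resulting mask byte by byte to locate the first hit.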
	.text
	.thumb

// ---------------------------------------------------------------------------
	.thumb_func
	.align 2
	.p2align 4,,15
	.global memchr
	.type memchr,%function
	.fnstart
	.cfi_sections .debug_frame
	.cfi_startproc
memchr:
	// r0 = start of memory to scan
	// r1 = character to look for
	// r2 = length
	// returns r0 = pointer to character or NULL if not found
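	// Example of the C contract (illustrative):
	//   const char buf[] = "hello";
	//   memchr(buf, 'l', 5);	// returns &buf[2]
	//   memchr(buf, 'z', 5);	// returns NULL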
	prologue
	and	r1,r1,#0xff	// Don't trust the caller to pass a char

	cmp	r2,#16		// If short, don't bother with anything clever
	blt	20f

	tst	r0, #7		// If it's already aligned, skip the next bit
	beq	10f

	// Work up to an aligned point
5:
	ldrb	r3, [r0],#1
	subs	r2, r2, #1
	cmp	r3, r1
	beq	50f		// If it matches, exit: found
	tst	r0, #7
	cbz	r2, 40f		// If we run off the end, exit: not found
				// (cbz preserves the tst flags)
	bne	5b		// If not aligned yet, do the next byte

10:
	// We are aligned: we know we have at least 8 bytes to work with
	push	{r4,r5,r6,r7}
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	.cfi_rel_offset 6, 8
	.cfi_rel_offset 7, 12
	orr	r1, r1, r1, lsl #8	// Expand the match byte across the whole word
	orr	r1, r1, r1, lsl #16
	bic	r4, r2, #7	// Number of bytes in whole doublewords
	mvns	r7, #0		// All f's
	movs	r3, #0

15:
	ldrd	r5,r6,[r0],#8
	subs	r4, r4, #8
	eor	r5,r5, r1	// r5,r6 have 00's where bytes match the target
	eor	r6,r6, r1
	uadd8	r5, r5, r7	// Parallel add 0xff - sets GE bits for bytes != 0
	sel	r5, r3, r7	// Bytes become 00 for non-00 bytes,
				// or ff for 00 bytes - NOTE INVERSION
	uadd8	r6, r6, r7	// Parallel add 0xff - sets GE bits for bytes != 0
	sel	r6, r5, r7	// Chained: bytes are 00 for non-00 bytes,
				// or ff for 00 bytes - NOTE INVERSION
	cbnz	r6, 60f
	bne	15b		// (Flags from the subs above)

	pop	{r4,r5,r6,r7}
	.cfi_restore 7
	.cfi_restore 6
	.cfi_restore 5
	.cfi_restore 4
	.cfi_adjust_cfa_offset -16
	and	r1,r1,#0xff	// r1 back to a single character
	and	r2,r2,#7	// Leave the count remaining as the number
				// left over after the doublewords were done

20:
	cbz	r2, 40f		// Zero length, or we already hit the end: not found

21:  // Post-alignment section, or just a short call
	ldrb	r3,[r0],#1
	subs	r2,r2,#1
	eor	r3,r3,r1	// r3 = 0 on a match; doesn't disturb the flags from the subs
	cbz	r3, 50f
	bne	21b		// On the flags from the subs of r2

40:
	.cfi_remember_state
	movs	r0,#0		// Not found
	epilogue

50:
	.cfi_restore_state
	.cfi_remember_state
	subs	r0,r0,#1	// Found
	epilogue

60:  // We're here because the fast path found a hit:
     // now we have to track down exactly which byte it was
	// r0 points to the start of the doubleword after the one tested
	// r5 has the 00/ff pattern for the first word, r6 has the chained value
	// This point is reached from the cbnz midway through label 15, before
	// r4-r7 are popped off the stack.  .cfi_restore_state alone would
	// disregard that, so we correct for it manually.
	.cfi_restore_state	// Standard post-prologue state
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	.cfi_rel_offset 6, 8
	.cfi_rel_offset 7, 12
	cmp	r5, #0
	itte	eq
	moveq	r5, r6		// The match is in the 2nd word
	subeq	r0,r0,#3	// Points to 2nd byte of 2nd word
	subne	r0,r0,#7	// or 2nd byte of 1st word

	// r0 currently points to the 2nd byte of the word containing the hit
	tst	r5, # CHARTSTMASK(0)	// 1st character
	bne	61f
	adds	r0,r0,#1
	tst	r5, # CHARTSTMASK(1)	// 2nd character
	ittt	eq
	addeq	r0,r0,#1
	tsteq	r5, # (3<<15)		// 2nd & 3rd character
	// If it's not the 3rd, it must be the last one
	addeq	r0,r0,#1

61:
	pop	{r4,r5,r6,r7}
	.cfi_restore 7
	.cfi_restore 6
	.cfi_restore 5
	.cfi_restore 4
	.cfi_adjust_cfa_offset -16
	subs	r0,r0,#1
	epilogue
	.cfi_endproc
	.cantunwind
	.fnend
#else
  /* Defined in memchr.c.  */
#endif
