1/*
2   strrchr - find last instance of a character in a string
3
4   Copyright (c) 2014, ARM Limited
5   All rights Reserved.
6
7   Redistribution and use in source and binary forms, with or without
8   modification, are permitted provided that the following conditions are met:
9       * Redistributions of source code must retain the above copyright
10         notice, this list of conditions and the following disclaimer.
11       * Redistributions in binary form must reproduce the above copyright
12         notice, this list of conditions and the following disclaimer in the
13         documentation and/or other materials provided with the distribution.
14       * Neither the name of the company nor the names of its contributors
15         may be used to endorse or promote products derived from this
16         software without specific prior written permission.
17
18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
29
30#include <picolibc.h>
31
32#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__) || !defined(__ARM_NEON)
33/* See strrchr-stub.c  */
34#else
35
36/* Assumptions:
37 *
38 * ARMv8-a, AArch64
39 * Neon Available.
40 */
41
42#include "asmdefs.h"
43
/* Arguments and results.  */
#define srcin		x0	/* in:  pointer to the string to search  */
#define chrin		w1	/* in:  character to look for (low 8 bits used)  */

#define result		x0	/* out: address of last match, or NULL  */

#define src		x2	/* 32-byte-aligned read pointer, advances by 32  */
#define	tmp1		x3	/* misalignment of srcin, then padding bit-shift  */
#define wtmp2		w4	/* builds the 0x40100401 magic lane constant  */
#define tmp3		x5	/* padding mask; later clz of the syndrome  */
#define src_match	x6	/* end of last hunk in which chrin was seen  */
#define src_offset	x7	/* chr syndrome of that hunk (0 = no match yet)  */
#define const_m1	x8	/* all-ones, source for the padding mask  */
#define tmp4		x9	/* scratch for the mask-below-NUL computation  */
#define nul_match	x10	/* NUL syndrome of the current hunk  */
#define chr_match	x11	/* chr syndrome of the current hunk  */

#define vrepchr		v0	/* chrin replicated into all 16 byte lanes  */
#define vdata1		v1	/* first 16 bytes of the current hunk  */
#define vdata2		v2	/* second 16 bytes of the current hunk  */
#define vhas_nul1	v3	/* per-byte NUL compare results (first half)  */
#define vhas_nul2	v4	/* per-byte NUL compare results (second half)  */
#define vhas_chr1	v5	/* per-byte chr compare results (first half)  */
#define vhas_chr2	v6	/* per-byte chr compare results (second half)  */
#define vrepmask_0	v7	/* 0x80200802 lane mask selecting NUL bits  */
#define vrepmask_c	v16	/* 0x40100401 lane mask selecting chr bits  */
#define vend1		v17	/* narrowed syndrome accumulator  */
#define vend2		v18	/* (unused in this view — TODO confirm)  */
72
73/* Core algorithm.
74
75   For each 32-byte hunk we calculate a 64-bit syndrome value, with
76   two bits per byte (LSB is always in bits 0 and 1, for both big
77   and little-endian systems).  For each tuple, bit 0 is set iff
78   the relevant byte matched the requested character; bit 1 is set
79   iff the relevant byte matched the NUL end of string (we trigger
80   off bit0 for the special case of looking for NUL).  Since the bits
81   in the syndrome reflect exactly the order in which things occur
82   in the original string a count_trailing_zeros() operation will
83   identify exactly which byte is causing the termination, and why.  */
84
/* char *strrchr (const char *s, int c)

   Returns a pointer to the LAST occurrence of the low byte of c in s,
   or NULL if it does not occur.  Searching for NUL itself is handled
   by the chr syndrome (NUL also matches the chr test when c == 0).

   In:   x0 = s, w1 = c.   Out:  x0 = pointer to last match or NULL.
   Clobbers: x2-x11, v0-v7, v16-v18, flags.  No stack usage (leaf).  */
ENTRY (strrchr)
	PTR_ARG (0)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the requested byte.  Magic constant 0x80200802 used
	   similarly for NUL termination.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask_c.4s, wtmp2
	mov	src_offset, #0		/* No chr match seen yet.  */
	ands	tmp1, srcin, #31
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
	b.eq	L(aligned)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Reduce the 0x00/0xff compare bytes to two bits per byte.  */
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vhas_nul1.16b, vhas_chr1.16b		// 128->64
	mov	nul_match, vend1.d[0]
	/* Two syndrome bits per byte, so the shift count is doubled.  */
	lsl	tmp1, tmp1, #1
	mov	const_m1, #~0
	lsr	tmp3, const_m1, tmp1	/* tmp3 = mask of the padding bits.  */
	mov	chr_match, vend1.d[1]

	bic	nul_match, nul_match, tmp3	// Mask padding bits.
	bic	chr_match, chr_match, tmp3	// Mask padding bits.
	cbnz	nul_match, L(tail)

	.p2align 4
L(loop):
	/* Record the most recent hunk that contained a chr match; the
	   flags here are still those of the previous hunk's syndrome.  */
	cmp	chr_match, #0
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne
L(aligned):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Pairwise unsigned min: any lane of vend1 is zero iff some byte
	   of the hunk is NUL — one cheap NUL test for the whole hunk.  */
	uminp	vend1.16b, vdata1.16b, vdata2.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	cmeq	vend1.16b, vend1.16b, 0
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vend1.16b, vhas_chr1.16b		// 128->64
	mov	nul_match, vend1.d[0]
	mov	chr_match, vend1.d[1]
	cbz	nul_match, L(loop)

	/* A NUL was detected; recompute its exact per-byte syndrome
	   (the fast loop test above only proved one exists).  */
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
	mov	nul_match, vhas_nul1.d[0]

L(tail):
	/* Work out exactly where the string ends: x ^ (x - 1) yields a
	   mask of every bit up to and including the lowest set bit of
	   nul_match, i.e. every syndrome position at or before the
	   first NUL.  Drop chr matches beyond the end of the string.  */
	sub	tmp4, nul_match, #1
	eor	tmp4, tmp4, nul_match
	ands	chr_match, chr_match, tmp4
	/* And pick the values corresponding to the last match.  */
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne

	/* Count down from the top of the syndrome to find the last match.  */
	clz	tmp3, src_offset
	/* Src_match points beyond the word containing the match, so we can
	   simply subtract half the bit-offset into the syndrome.  Because
	   we are counting down, we need to go back one more character.  */
	add	tmp3, tmp3, #2
	sub	result, src_match, tmp3, lsr #1
	/* But if the syndrome shows no match was found, then return NULL.  */
	cmp	src_offset, #0
	csel	result, result, xzr, ne

	ret

END (strrchr)
177#endif
178