/*
   strrchr - find last instance of a character in a string

   Copyright (c) 2014, ARM Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the company nor the names of its contributors
         may be used to endorse or promote products derived from this
         software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
/* See strrchr-stub.c  */
#else

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon available.
 */

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

/* Locals and temporaries.  */
#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5
#define src_match	x6
#define src_offset	x7
#define const_m1	x8
#define tmp4		x9
#define nul_match	x10
#define chr_match	x11

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask_0	v7
#define vrepmask_c	v16
#define vend1		v17
#define vend2		v18

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (the first byte of the hunk always maps to bits
   0 and 1, on both big- and little-endian systems).  For each tuple,
   bit 0 is set iff the relevant byte matched the requested character;
   bit 1 is set iff the relevant byte matched the NUL end of string
   (we trigger off bit 0 for the special case of looking for NUL).
   Since the bits in the syndrome reflect exactly the order in which
   things occur in the original string, a count_trailing_zeros()-style
   mask isolates everything up to the terminating NUL, and a
   count_leading_zeros() on the remaining character bits then locates
   the last match.  */
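
/* As a rough C sketch of one 32-byte step (illustrative only; the
   names hunk, c, chr and nul are not part of this file):

       uint64_t chr = 0, nul = 0;
       for (int i = 0; i < 32; i++) {
           if (hunk[i] == c)    chr |= 1ull << (2 * i);   // tuple bit 0
           if (hunk[i] == '\0') nul |= 2ull << (2 * i);   // tuple bit 1
       }

   The code below builds these two syndromes with CMEQ + AND against
   the magic constants, narrowing 256 bits to 64 with a chain of ADDP
   instructions; the tail then keeps only the chr bits at or below the
   lowest nul bit and picks the highest bit that remains.  */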

	/* Define function 'f' in .text, aligned to 2**p2align bytes,
	   with global scope and function type.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

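/* char *strrchr (const char *s, int c)

   Returns a pointer to the last occurrence of C (converted to a
   char) in S, or NULL if C does not occur.  The terminating NUL
   counts as part of the string, so strrchr (s, '\0') returns a
   pointer to the terminator.  */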
def_fn strrchr
	/* Magic constant 0x40100401 allows us to identify which lane
	   matches the requested byte.  Magic constant 0x80200802 is
	   used similarly for NUL termination.  */
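	/* Within each 32-bit lane, 0x40100401 gives the byte lanes the
	   values 01, 04, 10, 40 from low to high, so after the ADDP
	   reductions each run of four input bytes lands in one syndrome
	   byte with match bits 0, 2, 4 and 6; the doubled constant
	   0x80200802 puts the NUL bits at 1, 3, 5 and 7.  */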
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask_c.4s, wtmp2
	mov	src_offset, #0		/* No match recorded yet.  */
	ands	tmp1, srcin, #31	/* tmp1 = byte misalignment; Z if aligned.  */
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s	/* equiv: lsl #1 */
	b.eq	.Laligned

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
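	/* For example, with srcin & 31 == 3 the three bytes below srcin
	   are padding: tmp1 is negated to -3 and doubled to -6 below,
	   and ~0 >> (-6 & 63) leaves just the low six syndrome bits
	   (two per padding byte) set in tmp3 for BIC to clear.  */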
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b	// 128->64
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b	// 128->64
	mov	nul_match, vhas_nul1.d[0]
	lsl	tmp1, tmp1, #1		/* tmp1 = -2 * misalignment.  */
	mov	const_m1, #~0
	mov	chr_match, vhas_chr1.d[0]
	lsr	tmp3, const_m1, tmp1	/* Shift count is taken mod 64.  */

	bic	nul_match, nul_match, tmp3	// Mask padding bits.
	bic	chr_match, chr_match, tmp3	// Mask padding bits.
	cbnz	nul_match, .Ltail

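	/* Main loop.  On entry from below, the syndromes describe the
	   hunk just examined and src points past it; if that hunk held
	   a match, record where.  Then load and test the next 32 bytes,
	   stopping once a hunk containing a NUL has been seen.  */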
.Lloop:
	cmp	chr_match, #0
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne
.Laligned:
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	addp	vend1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vend1.16b, vend1.16b	// 128->64
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr1.16b	// 128->64
	mov	nul_match, vend1.d[0]
	mov	chr_match, vhas_chr1.d[0]
	cbz	nul_match, .Lloop

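	/* Inside the loop the NUL test only had to detect presence, so
	   the raw CMEQ results were reduced unmasked.  Now that the
	   terminating hunk has been found, redo the reduction with
	   vrepmask_0 applied to obtain the positional NUL syndrome.  */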
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b	// 128->64
	mov	nul_match, vhas_nul1.d[0]

.Ltail:
	/* Work out exactly where the string ends.  */
	sub	tmp4, nul_match, #1
	eor	tmp4, tmp4, nul_match	/* tmp4 = bits at or below the
					   first NUL bit.  */
	ands	chr_match, chr_match, tmp4	/* Drop matches past the NUL.  */
	/* And pick the values corresponding to the last match.  */
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne

	/* Count down from the top of the syndrome to find the last match.  */
	clz	tmp3, src_offset
	/* Src_match points just past the 32-byte hunk containing the
	   match, so we can simply subtract half the bit-offset into the
	   syndrome.  Because we are counting down, we need to go back
	   one more character.  */
	add	tmp3, tmp3, #2
	sub	result, src_match, tmp3, lsr #1
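	/* Illustration: a match in byte 5 of the hunk sets syndrome
	   bit 10, so clz yields 53 and src_match - ((53 + 2) >> 1)
	   = src_match - 27 = hunk base + 5, as required.  */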
	/* But if the syndrome shows no match was found, then return NULL.  */
	cmp	src_offset, #0
	csel	result, result, xzr, ne

	ret

	.size	strrchr, . - strrchr
#endif