/*
   strrchr - find last instance of a character in a string

   Copyright (c) 2014, ARM Limited
   All rights Reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the company nor the names of its contributors
         may be used to endorse or promote products derived from this
         software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
/* See strrchr-stub.c  */
#else

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon available.
 */

#include "asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5
#define src_match	x6
#define src_offset	x7
#define const_m1	x8
#define tmp4		x9
#define nul_match	x10
#define chr_match	x11

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask_0	v7
#define vrepmask_c	v16
#define vend1		v17
#define vend2		v18

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (the tuple for the first byte of the hunk always
   occupies bits 0 and 1, on both big- and little-endian systems).
   For each tuple, bit 0 is set iff the relevant byte matched the
   requested character; bit 1 is set iff the relevant byte matched
   the NUL end of string (we trigger off bit 0 for the special case
   of looking for NUL).  Since the bits in the syndrome reflect
   exactly the order in which things occur in the original string, a
   count_trailing_zeros() operation will identify exactly which byte
   is causing the termination, and why.  */

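/* As a rough C sketch of the scheme above (illustrative only, not
   part of the build; `hunk` and `c` are hypothetical names):

	uint64_t chr_syndrome = 0, nul_syndrome = 0;
	for (int i = 0; i < 32; i++)
	  {
	    if (hunk[i] == c)
	      chr_syndrome |= 1ULL << (2 * i);	    // tuple bit 0
	    if (hunk[i] == '\0')
	      nul_syndrome |= 1ULL << (2 * i + 1);  // tuple bit 1
	  }

   The Neon code below computes these two values (chr_match and
   nul_match) in parallel with cmeq/and/addp.  */
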
ENTRY (strrchr)
	PTR_ARG (0)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the requested byte.  Magic constant 0x80200802 used
	   similarly for NUL termination.  */
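	/* As a worked illustration (viewed on a little-endian system):
	   within each 32-bit word the byte lanes of 0x40100401 hold
	   0x01, 0x04, 0x10, 0x40, so after AND-ing with the 0x00/0xff
	   bytes produced by cmeq, lane i holds bit 2*(i % 4); the addp
	   reductions below then pack the lanes so that a match in byte
	   i of the hunk sets bit 2*i of the 64-bit syndrome.
	   0x80200802 is the same pattern shifted left by one, putting
	   NUL matches at the odd bits 2*i+1.  */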
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask_c.4s, wtmp2
	mov	src_offset, #0
	ands	tmp1, srcin, #31
	add	vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
	b.eq	L(aligned)

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b	// 256->128
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vhas_nul1.16b, vhas_chr1.16b		// 128->64
	mov	nul_match, vend1.d[0]
	lsl	tmp1, tmp1, #1
	mov	const_m1, #~0
	lsr	tmp3, const_m1, tmp1
	mov	chr_match, vend1.d[1]

	bic	nul_match, nul_match, tmp3	// Mask padding bits.
	bic	chr_match, chr_match, tmp3	// Mask padding bits.
	cbnz	nul_match, L(tail)
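	/* A worked example of the mask (illustrative): for a
	   misalignment of srcin & 31 == 3, tmp1 becomes (-3) << 1 = -6;
	   AArch64 register shifts use the amount modulo 64, so
	   tmp3 = ~0 >> 58 = 0x3f, exactly the six syndrome bits of the
	   three padding bytes that the bic instructions above clear.  */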

	.p2align 4
L(loop):
	/* If the previous hunk contained a match, record the hunk end
	   and its syndrome before moving on.  */
	cmp	chr_match, #0
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne
L(aligned):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	uminp	vend1.16b, vdata1.16b, vdata2.16b	/* Zero byte iff hunk has a NUL.  */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
	cmeq	vend1.16b, vend1.16b, #0
	addp	vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vend1.16b, vhas_chr1.16b		// 128->64
	mov	nul_match, vend1.d[0]
	mov	chr_match, vend1.d[1]
	cbz	nul_match, L(loop)

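	/* The uminp in the loop gave only a cheap "NUL somewhere in
	   this hunk" test; recompute the exact per-byte NUL syndrome
	   now that we know the string ends here.  */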
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	and	vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
	and	vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
	addp	vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
	mov	nul_match, vhas_nul1.d[0]

L(tail):
	/* Work out exactly where the string ends.  */
	sub	tmp4, nul_match, #1
	eor	tmp4, tmp4, nul_match
	ands	chr_match, chr_match, tmp4
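	/* Worked example (illustrative): with the terminator in byte k
	   of the hunk, the lowest set bit of nul_match is bit 2*k+1, so
	   nul_match ^ (nul_match - 1) sets bits 0..2*k+1 and the ands
	   keeps only character matches in bytes 0..k, i.e. at or before
	   the NUL.  */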
	/* And pick the values corresponding to the last match.  */
	csel	src_match, src, src_match, ne
	csel	src_offset, chr_match, src_offset, ne

	/* Count down from the top of the syndrome to find the last match.  */
	clz	tmp3, src_offset
	/* Src_match points just past the 32-byte hunk containing the
	   match, so we can simply subtract half the bit offset into
	   the syndrome.  Because we are counting down, we need to go
	   back one more character.  */
	add	tmp3, tmp3, #2
	sub	result, src_match, tmp3, lsr #1
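	/* Worked example (illustrative): a match in byte b of the hunk
	   sets bit 2*b, so clz = 63 - 2*b and (clz + 2) >> 1 = 32 - b;
	   subtracting that from src_match (hunk start + 32) yields the
	   address of byte b.  */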
	/* But if the syndrome shows no match was found, then return NULL.  */
	cmp	src_offset, #0
	csel	result, result, xzr, ne

	ret

END (strrchr)
#endif