1/* Copyright (c) 2013-2015, Linaro Limited
2   All rights reserved.
3
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions are met:
6       * Redistributions of source code must retain the above copyright
7	 notice, this list of conditions and the following disclaimer.
8       * Redistributions in binary form must reproduce the above copyright
9	 notice, this list of conditions and the following disclaimer in the
10	 documentation and/or other materials provided with the distribution.
11       * Neither the name of the Linaro nor the
12	 names of its contributors may be used to endorse or promote products
13	 derived from this software without specific prior written permission.
14
15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
26#include <picolibc.h>
27
28#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__) || !defined(__ARM_NEON)
29/* See strlen-stub.c  */
30#else
31
32/* Assumptions:
33 *
34 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
35 * Not MTE compatible.
36 */
37
38#include "asmdefs.h"
39
40#define srcin	x0
41#define len	x0
42
43#define src	x1
44#define data1	x2
45#define data2	x3
46#define has_nul1 x4
47#define has_nul2 x5
48#define tmp1	x4
49#define tmp2	x5
50#define tmp3	x6
51#define tmp4	x7
52#define zeroones x8
53
54#define maskv	v0
55#define maskd	d0
56#define dataq1	q1
57#define dataq2	q2
58#define datav1	v1
59#define datav2	v2
60#define tmp	x2
61#define tmpw	w2
62#define synd	x3
63#define syndw	w3
64#define shift	x4
65
66/* For the first 32 bytes, NUL detection works on the principle that
67   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
68   byte is zero, and can be done in parallel across the entire word.  */
69
70#define REP8_01 0x0101010101010101
71#define REP8_7f 0x7f7f7f7f7f7f7f7f
72
73/* To test the page crossing code path more thoroughly, compile with
74   -DTEST_PAGE_CROSS - this will force all calls through the slower
75   entry path.  This option is not intended for production use.  */
76
77#ifdef TEST_PAGE_CROSS
78# define MIN_PAGE_SIZE 32
79#else
80# define MIN_PAGE_SIZE 4096
81#endif
82
83/* Core algorithm:
84
85   Since strings are short on average, we check the first 32 bytes of the
86   string for a NUL character without aligning the string.  In order to use
87   unaligned loads safely we must do a page cross check first.
88
89   If there is a NUL byte we calculate the length from the 2 8-byte words
90   using conditional select to reduce branch mispredictions (it is unlikely
91   strlen will be repeatedly called on strings with the same length).
92
93   If the string is longer than 32 bytes, align src so we don't need further
94   page cross checks, and process 32 bytes per iteration using a fast SIMD
95   loop.
96
97   If the page cross check fails, we read 32 bytes from an aligned address,
98   and ignore any characters before the string.  If it contains a NUL
99   character, return the length, if not, continue in the main loop.  */
100
/* size_t strlen (const char *str)
   ABI:  AAPCS64.  In: x0 = str (srcin).  Out: x0 = length (len).
   Clobbers x1-x8, v0-v2 and flags; uses no stack.
   Requires unaligned access support; not MTE compatible (reads up to
   31 bytes past the NUL, though never across a page boundary).  */
ENTRY (strlen)
	PTR_ARG (0)
	/* If the offset within the page exceeds MIN_PAGE_SIZE - 32, an
	   unaligned 32-byte probe from srcin could cross into the next
	   (possibly unmapped) page — take the aligned slow path.  */
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	cmp	tmp1, MIN_PAGE_SIZE - 32
	b.hi	L(page_cross)

	/* Look for a NUL byte in the first 16 bytes.  */
	ldp	data1, data2, [srcin]
	mov	zeroones, REP8_01

#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	/* has_nulN = (dataN - 0x01..01) & ~(dataN | 0x7f..7f):
	   each byte of the result is 0x80 where dataN had a NUL byte,
	   0x00 elsewhere (see the header comment above).  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2	/* Logical BICS sets Z, forces C = 0.  */
	bic	has_nul2, tmp3, tmp4
	/* If has_nul1 == 0 this performs cmp has_nul2, 0 (C becomes 1);
	   otherwise NZCV is forced to 0.  So afterwards C = (has_nul1 == 0)
	   and Z = (no NUL in either word).  */
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(bytes16_31)

	/* Find the exact offset of the first NUL byte in the first 16 bytes
	   from the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc	/* Word holding the first NUL.  */
	mov	len, 8
	rev	has_nul1, has_nul1	/* Make the first marker most significant.  */
	csel	len, xzr, len, cc	/* Base offset: 0 for data1, 8 for data2.  */
	clz	tmp1, has_nul1
	add	len, len, tmp1, lsr 3	/* clz / 8 = byte index within the word.  */
	ret

	/* Look for a NUL byte at offset 16..31 in the string.  */
L(bytes16_31):
	ldp	data1, data2, [srcin, 16]
#ifdef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	/* Same NUL-detection and flag protocol as for bytes 0..15.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(loop_entry)

	/* Find the exact offset of the first NUL byte at offset 16..31 from
	   the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 24
	rev	has_nul1, has_nul1
	mov	tmp3, 16
	clz	tmp1, has_nul1
	csel	len, tmp3, len, cc	/* Base offset: 16 for data1, 24 for data2.  */
	add	len, len, tmp1, lsr 3
	ret

	nop	/* Padding — presumably to tune branch/loop placement; confirm
		   against the .p2align 5 below before removing.  */
L(loop_entry):
	/* Round src down to a 32-byte boundary; the loop's pre-increment
	   then starts at the NEXT 32-byte chunk, skipping bytes 0..31
	   which were already checked above.  */
	bic	src, srcin, 31

	.p2align 5
L(loop):
	ldp	dataq1, dataq2, [src, 32]!	/* Advance src, load 32 bytes.  */
	uminp	maskv.16b, datav1.16b, datav2.16b	/* Pairwise byte min: a zero
							   survives any reduction.  */
	uminp	maskv.16b, maskv.16b, maskv.16b	/* Reduce again; low 8 bytes valid.  */
	cmeq	maskv.8b, maskv.8b, 0	/* 0xff per lane whose 4-byte group had a NUL.  */
	fmov	synd, maskd
	cbz	synd, L(loop)

	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
	cmeq	maskv.16b, datav1.16b, 0
	sub	len, src, srcin	/* Bytes preceding the current 32-byte chunk.  */
	cbnz	syndw, 1f
	cmeq	maskv.16b, datav2.16b, 0	/* NUL is in the second 16 bytes.  */
	add	len, len, 16
1:
	/* Generate a bitmask and compute correct byte offset.  */
	shrn	maskv.8b, maskv.8h, 4	/* Narrow to a 64-bit mask, 4 bits per byte.  */
	fmov	synd, maskd
#ifndef __AARCH64EB__
	rbit	synd, synd	/* Little-endian: make the first NUL's bits leading.  */
#endif
	clz	tmp, synd
	add	len, len, tmp, lsr 2	/* clz / 4 = byte index (4 mask bits per byte).  */
	ret

L(page_cross):
	/* Read the whole aligned 32-byte block containing srcin, then
	   discard matches for bytes that precede the string start.  */
	bic	src, srcin, 31
	mov	tmpw, 0x0c03
	movk	tmpw, 0xc030, lsl 16	/* Per-byte weights 0x03,0x0c,0x30,0xc0: after the
					   two ADDP folds below, each input byte owns a
					   distinct 2-bit field of the 64-bit syndrome.  */
	ld1	{datav1.16b, datav2.16b}, [src]
	dup	maskv.4s, tmpw
	cmeq	datav1.16b, datav1.16b, 0
	cmeq	datav2.16b, datav2.16b, 0
	and	datav1.16b, datav1.16b, maskv.16b
	and	datav2.16b, datav2.16b, maskv.16b
	addp	maskv.16b, datav1.16b, datav2.16b	/* Fold 32 -> 16 bytes.  */
	addp	maskv.16b, maskv.16b, maskv.16b	/* Fold 16 -> 8 bytes (2 bits/byte).  */
	fmov	synd, maskd
	lsl	shift, srcin, 1	/* LSR uses the low 6 bits, so this acts as
				   (srcin & 31) * 2 — two syndrome bits per byte.  */
	lsr	synd, synd, shift	/* Drop matches before the string start.  */
	cbz	synd, L(loop)	/* No NUL up to the block end: enter the main loop
				   (src is already aligned for its pre-increment).  */

	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 1	/* clz / 2 = byte offset (2 mask bits per byte).  */
	ret

END (strlen)
218#endif
219