1/* Copyright (c) 2013-2015, Linaro Limited
2   All rights reserved.
3
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions are met:
6       * Redistributions of source code must retain the above copyright
7	 notice, this list of conditions and the following disclaimer.
8       * Redistributions in binary form must reproduce the above copyright
9	 notice, this list of conditions and the following disclaimer in the
10	 documentation and/or other materials provided with the distribution.
11       * Neither the name of the Linaro nor the
12	 names of its contributors may be used to endorse or promote products
13	 derived from this software without specific prior written permission.
14
15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
26#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
27/* See strlen-stub.c  */
28#else
29
30/* Assumptions:
31 *
32 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
33 * Not MTE compatible.
34 */
35
36#include "asmdefs.h"
37
/* Register aliases - scalar (first 32 bytes) path.
   Note x0 and x4/x5 are deliberately double-aliased: len reuses the
   argument register, and tmp1/tmp2 reuse has_nul1/has_nul2 (never
   live at the same time).  */
#define srcin	x0	/* Argument: start of string.  */
#define len	x0	/* Result: string length.  */

#define src	x1	/* Current aligned read position (SIMD loop).  */
#define data1	x2	/* Low 8 bytes of a 16-byte chunk.  */
#define data2	x3	/* High 8 bytes of a 16-byte chunk.  */
#define has_nul1 x4	/* NUL-byte syndrome for data1.  */
#define has_nul2 x5	/* NUL-byte syndrome for data2.  */
#define tmp1	x4	/* Scratch; aliases has_nul1.  */
#define tmp2	x5	/* Scratch; aliases has_nul2.  */
#define tmp3	x6
#define tmp4	x7
#define zeroones x8	/* Holds REP8_01.  */

/* Register aliases - SIMD loop and page-cross paths.  */
#define maskv	v0	/* Reduction / compare mask vector.  */
#define maskd	d0	/* Low 64 bits of maskv (for fmov to GPR).  */
#define dataq1	q1	/* First 16 data bytes, Q-register view.  */
#define dataq2	q2	/* Second 16 data bytes, Q-register view.  */
#define datav1	v1	/* First 16 data bytes, vector view.  */
#define datav2	v2	/* Second 16 data bytes, vector view.  */
#define tmp	x2	/* Scratch for the final offset computation.  */
#define tmpw	w2	/* 32-bit view of tmp.  */
#define synd	x3	/* NUL syndrome bitmask moved from maskd.  */
#define syndw	w3	/* 32-bit view of synd.  */
#define shift	x4	/* Bit-shift amount in the page-cross path.  */
63
64/* For the first 32 bytes, NUL detection works on the principle that
65   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
66   byte is zero, and can be done in parallel across the entire word.  */
67
68#define REP8_01 0x0101010101010101
69#define REP8_7f 0x7f7f7f7f7f7f7f7f
70
71/* To test the page crossing code path more thoroughly, compile with
72   -DTEST_PAGE_CROSS - this will force all calls through the slower
73   entry path.  This option is not intended for production use.  */
74
75#ifdef TEST_PAGE_CROSS
76# define MIN_PAGE_SIZE 32
77#else
78# define MIN_PAGE_SIZE 4096
79#endif
80
81/* Core algorithm:
82
83   Since strings are short on average, we check the first 32 bytes of the
84   string for a NUL character without aligning the string.  In order to use
85   unaligned loads safely we must do a page cross check first.
86
87   If there is a NUL byte we calculate the length from the 2 8-byte words
88   using conditional select to reduce branch mispredictions (it is unlikely
89   strlen will be repeatedly called on strings with the same length).
90
91   If the string is longer than 32 bytes, align src so we don't need further
92   page cross checks, and process 32 bytes per iteration using a fast SIMD
93   loop.
94
95   If the page cross check fails, we read 32 bytes from an aligned address,
96   and ignore any characters before the string.  If it contains a NUL
97   character, return the length, if not, continue in the main loop.  */
98
ENTRY (strlen)
	PTR_ARG (0)
	/* If fewer than 32 bytes remain before the end of the page, an
	   unaligned 32-byte read could fault on the next page - take the
	   aligned slow path instead.  */
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	cmp	tmp1, MIN_PAGE_SIZE - 32
	b.hi	L(page_cross)

	/* Look for a NUL byte in the first 16 bytes.  */
	ldp	data1, data2, [srcin]
	mov	zeroones, REP8_01

#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	/* has_nulN = (dataN - 0x01..01) & ~(dataN | 0x7f..7f); see the
	   NUL-detection principle described above REP8_01.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	/* BICS leaves C clear; CCMP then sets C only on the eq path
	   (has_nul1 == 0), where it compares has_nul2 against 0.  */
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(bytes16_31)

	/* Find the exact offset of the first NUL byte in the first 16 bytes
	   from the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc	/* Pick the word with the NUL.  */
	mov	len, 8
	rev	has_nul1, has_nul1	/* Byte-swap so CLZ counts bytes before the NUL.  */
	csel	len, xzr, len, cc	/* Base offset: 0 for word 1, 8 for word 2.  */
	clz	tmp1, has_nul1
	add	len, len, tmp1, lsr 3	/* Leading-zero bits / 8 = byte index in word.  */
	ret

	/* Look for a NUL byte at offset 16..31 in the string.  */
L(bytes16_31):
	ldp	data1, data2, [srcin, 16]
#ifdef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	/* Same NUL-detection and flag sequence as for the first 16 bytes.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	b.eq	L(loop_entry)

	/* Find the exact offset of the first NUL byte at offset 16..31 from
	   the string start.  Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 24
	rev	has_nul1, has_nul1
	mov	tmp3, 16
	clz	tmp1, has_nul1
	csel	len, tmp3, len, cc	/* Base offset: 16 for word 1, 24 for word 2.  */
	add	len, len, tmp1, lsr 3
	ret

	/* NOTE(review): padding nop, presumably so the code below lands on
	   the boundary requested by .p2align - confirm before removing.  */
	nop
L(loop_entry):
	/* Round src down to 32 bytes; the pre-indexed load in the loop
	   starts with the next 32-byte block (bytes 0..31 were checked).  */
	bic	src, srcin, 31

	.p2align 5
L(loop):
	ldp	dataq1, dataq2, [src, 32]!
	/* Two pairwise-minimum steps reduce 32 bytes to 8; a result byte is
	   zero iff one of the four input bytes it covers is zero.  */
	uminp	maskv.16b, datav1.16b, datav2.16b
	uminp	maskv.16b, maskv.16b, maskv.16b
	cmeq	maskv.8b, maskv.8b, 0
	fmov	synd, maskd
	cbz	synd, L(loop)

	/* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
	cmeq	maskv.16b, datav1.16b, 0
	sub	len, src, srcin
	cbnz	syndw, 1f
	/* NUL is in datav2: redo the compare and skip the first 16 bytes.  */
	cmeq	maskv.16b, datav2.16b, 0
	add	len, len, 16
1:
	/* Generate a bitmask and compute correct byte offset.  */
	shrn	maskv.8b, maskv.8h, 4	/* Narrow 16 compare bytes to 4 bits each.  */
	fmov	synd, maskd
#ifndef __AARCH64EB__
	rbit	synd, synd	/* Little-endian: put the first byte's bits at the top.  */
#endif
	clz	tmp, synd
	add	len, len, tmp, lsr 2	/* Leading-zero bits / 4 = byte index.  */
	ret

L(page_cross):
	/* Read the whole aligned 32-byte block containing the string start,
	   then discard any matches before srcin.  */
	bic	src, srcin, 31
	/* Build the repeating per-byte mask 0x03,0x0c,0x30,0xc0 so each byte
	   contributes 2 unique bits that survive the ADDP reductions.  */
	mov	tmpw, 0x0c03
	movk	tmpw, 0xc030, lsl 16
	ld1	{datav1.16b, datav2.16b}, [src]
	dup	maskv.4s, tmpw
	cmeq	datav1.16b, datav1.16b, 0
	cmeq	datav2.16b, datav2.16b, 0
	and	datav1.16b, datav1.16b, maskv.16b
	and	datav2.16b, datav2.16b, maskv.16b
	/* Two pairwise adds fold 32 masked bytes into a 64-bit syndrome
	   holding 2 bits per input byte.  */
	addp	maskv.16b, datav1.16b, datav2.16b
	addp	maskv.16b, maskv.16b, maskv.16b
	fmov	synd, maskd
	/* Shift out the bits for bytes before the string start (2 bits per
	   byte; the shift amount is taken mod 64).  */
	lsl	shift, srcin, 1
	lsr	synd, synd, shift
	cbz	synd, L(loop)	/* No NUL in this block - enter the main loop.  */

	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 1	/* Bit index / 2 = byte offset = length.  */
	ret

END (strlen)
216#endif
217