1/* strnlen - calculate the length of a string with limit.
2
3   Copyright (c) 2013, Linaro Limited
4   All rights reserved.
5
6   Redistribution and use in source and binary forms, with or without
7   modification, are permitted provided that the following conditions are met:
8       * Redistributions of source code must retain the above copyright
9         notice, this list of conditions and the following disclaimer.
10       * Redistributions in binary form must reproduce the above copyright
11         notice, this list of conditions and the following disclaimer in the
12         documentation and/or other materials provided with the distribution.
13       * Neither the name of the Linaro nor the
14         names of its contributors may be used to endorse or promote products
15         derived from this software without specific prior written permission.
16
17   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
28
29#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
/* See strnlen-stub.c  */
31#else
32
33/* Assumptions:
34 *
35 * ARMv8-a, AArch64
36 */
37
/* Arguments and results.
   srcin and len both name x0: the incoming pointer is overwritten by
   the result, so srcin must not be read once len has been written.  */
#define srcin		x0	/* In:  start of the string.  */
#define len		x0	/* Out: returned length.  */
#define limit		x1	/* In:  maximum length to report.  */

/* Locals and temporaries.  */
#define src		x2	/* Read cursor, srcin rounded down to 16 bytes.  */
#define data1		x3	/* First Dword of the current 16-byte block.  */
#define data2		x4	/* Second Dword of the current 16-byte block.  */
#define data2a		x5	/* data2 with its leading bytes masked.  */
#define has_nul1	x6	/* NUL syndrome for data1.  */
#define has_nul2	x7	/* NUL syndrome for data2.  */
#define tmp1		x8	/* Scratch.  */
#define tmp2		x9	/* Scratch.  */
#define tmp3		x10	/* Scratch.  */
#define tmp4		x11	/* Scratch.  */
#define zeroones	x12	/* Holds REP8_01.  */
#define pos		x13	/* Bit position of the first NUL.  */
#define limit_wd	x14	/* 16-byte blocks left to examine, minus one.  */
57
/* def_fn f [p2align]
   Start a function named f: switch to .text, apply .p2align with the
   given exponent (default 0), export the symbol, mark it as %function
   and emit its label.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm
65
/* 64-bit constants made of one byte value repeated eight times.
   REP8_01 and REP8_7f feed the word-at-a-time NUL test; REP8_80 is
   not referenced by the code visible in this file.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
69
/* strnlen -- length of a string, capped at a limit.

   In:   x0 (srcin) = start of the string
         x1 (limit) = maximum length to report
   Out:  x0 (len)   = MIN (index of the first NUL byte, limit)
   Uses: x2-x14 as scratch and the NZCV flags.  Nothing is stored to
         memory and the stack is not touched.

   The string is read in 16-byte blocks starting from srcin rounded
   down to a 16-byte boundary, so bytes that lie before srcin, or after
   the NUL / the limit but inside the same aligned block, may be loaded.

   Layout: .Lstart is 64-byte aligned.  The 7 nops, the 2 instructions
   of .Lhit_limit and the 7 instructions between the entry point and
   .Lloop add up to 16 instructions of 4 bytes each, which places
   .Lloop on the next 64-byte boundary.  Adding or removing an
   instruction ahead of .Lloop requires re-balancing the nop count.  */
	.text
	.p2align	6
.Lstart:
	/* Pre-pad to ensure critical loop begins an icache line.  */
	.rep 7
	nop
	.endr
	/* Put this code here to avoid wasting more space with pre-padding.  */
.Lhit_limit:
	mov	len, limit		/* No NUL within limit: return limit.  */
	ret

def_fn strnlen
	cbz	limit, .Lhit_limit	/* limit == 0: return 0, load nothing.  */
	mov	zeroones, #REP8_01
	bic	src, srcin, #15		/* src = srcin rounded down to 16.  */
	ands	tmp1, srcin, #15	/* tmp1 = offset of srcin in its block.  */
	b.ne	.Lmisaligned
	/* Calculate the number of full and partial words -1.  */
	sub	limit_wd, limit, #1	/* Limit != 0, so no underflow.  */
	lsr	limit_wd, limit_wd, #4	/* Convert to Qwords.  */

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
	/* The inner loop deals with two Dwords at a time.  This has a
	   slightly higher start-up cost, but we should win quite quickly,
	   especially on cores with a high number of issue slots per
	   cycle, as we get much better parallelism out of the operations.  */

	/* Start of critical section -- keep to one 64Byte cache line.  */
.Lloop:
	ldp	data1, data2, [src], #16	/* Load a block, advance src.  */
.Lrealigned:
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2	/* (X - 1) & ~(X | 0x7f) for data1.  */
	bic	has_nul2, tmp3, tmp4	/* (X - 1) & ~(X | 0x7f) for data2.  */
	subs	limit_wd, limit_wd, #1	/* One block consumed; N set when done.  */
	orr	tmp1, has_nul1, has_nul2
	/* While blocks remain (pl), compare the combined syndrome with 0.
	   Once the count has gone negative, force NZCV = 0000 (i.e. "ne")
	   so that the loop below terminates.  */
	ccmp	tmp1, #0, #0, pl	/* NZCV = 0000  */
	b.eq	.Lloop			/* No NUL yet and blocks remain.  */
	/* End of critical section -- keep to one 64Byte cache line.  */

	/* tmp1 may hold either the syndrome or its value from the exit
	   through the limit, so recompute it before testing.  */
	orr	tmp1, has_nul1, has_nul2
	cbz	tmp1, .Lhit_limit	/* No null in final Qword.  */

	/* We know there's a null in the final Qword.  The easiest thing
	   to do now is work out the length of the string and return
	   MIN (len, limit).  */

	/* src points just past the block that was examined.  From here the
	   code subtracts 16 (NUL in data1) or 8 (NUL in data2) and then
	   adds the byte index of the NUL within that Dword.  */
	sub	len, src, srcin		/* Last read of srcin: len is x0 too.  */
	cbz	has_nul1, .Lnul_in_data2
#ifdef __AARCH64EB__
	mov	data2, data1		/* Syndrome is recomputed from data2.  */
#endif
	sub	len, len, #8		/* Step back over data2.  */
	mov	has_nul2, has_nul1	/* Common tail works on has_nul2.  */
.Lnul_in_data2:
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	rev	data2, data2
	sub	tmp1, data2, zeroones
	orr	tmp2, data2, #REP8_7f
	bic	has_nul2, tmp1, tmp2
#endif
	sub	len, len, #8		/* Step back to the start of the Dword.  */
	rev	has_nul2, has_nul2	/* Move the earliest byte to the MSB.  */
	clz	pos, has_nul2		/* Bit distance to the first NUL marker.  */
	add	len, len, pos, lsr #3		/* Bits to bytes.  */
	cmp	len, limit
	csel	len, len, limit, ls		/* Return the lower value.  */
	ret

.Lmisaligned:
	/* Deal with a partial first word.
	   We're doing two things in parallel here;
	   1) Calculate the number of words (but avoiding overflow if
	      limit is near ULONG_MAX) - to do this we need to work out
	      limit + tmp1 - 1 as a 65-bit value before shifting it;
	   2) Load and mask the initial data words - we force the bytes
	      before the ones we are interested in to 0xff - this ensures
	      early bytes will not hit any zero detection.

	   On entry tmp1 = srcin & 15, which is non-zero here.  The flags
	   set by the cmp below are consumed by the csinv/csel at the end;
	   none of the instructions in between modify NZCV.  */
	sub	limit_wd, limit, #1
	neg	tmp4, tmp1
	cmp	tmp1, #8		/* Does srcin start in data1 or data2?  */

	and	tmp3, limit_wd, #15	/* Low 4 bits of (limit - 1).  */
	lsr	limit_wd, limit_wd, #4	/* High bits of (limit - 1).  */
	mov	tmp2, #~0

	ldp	data1, data2, [src], #16
	lsl	tmp4, tmp4, #3		/* Bytes beyond alignment -> bits.  */
	add	tmp3, tmp3, tmp1	/* Low part of limit - 1 + tmp1.  */

#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp4	/* Shift count is (tmp4 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp4	/* Shift count is (tmp4 & 63).  */
#endif
	/* Fold the carry out of the low part into the block count.  */
	add	limit_wd, limit_wd, tmp3, lsr #4

	orr	data1, data1, tmp2	/* Mask leading bytes in data1.  */
	orr	data2a, data2, tmp2	/* Same mask applied to data2.  */

	/* tmp1 <= 8: the string starts in data1, so keep the masked data1
	   and the untouched data2.  tmp1 > 8: the string starts in data2,
	   so data1 becomes all ones and data2 takes the masked copy.  */
	csinv	data1, data1, xzr, le
	csel	data2, data2, data2a, le
	b	.Lrealigned
	.size	strnlen, . - .Lstart	/* Include pre-padding in size.  */
186
187#endif
188