1/* Copyright (c) 2013, 2018, Linaro Limited
2   All rights reserved.
3
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions are met:
6       * Redistributions of source code must retain the above copyright
7         notice, this list of conditions and the following disclaimer.
8       * Redistributions in binary form must reproduce the above copyright
9         notice, this list of conditions and the following disclaimer in the
10         documentation and/or other materials provided with the distribution.
11       * Neither the name of the Linaro nor the
12         names of its contributors may be used to endorse or promote products
13         derived from this software without specific prior written permission.
14
15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
26
27#include <picolibc.h>
28
29#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
30/* See strncmp-stub.c  */
31#else
32
33/* Assumptions:
34 *
35 * ARMv8-a, AArch64.
36 * MTE compatible.
37 */
38
39#include "asmdefs.h"
40
/* Bit patterns for parallel NUL detection: REP8_01 replicates 0x01
   into every byte of a dword, REP8_7f replicates 0x7f.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define mask		x13
#define endloop		x14
/* NOTE(review): count aliases mask and offset aliases pos; this relies
   on their live ranges not overlapping (count/offset are used while
   establishing alignment, mask/pos in the later compare and epilogue
   phases) — confirm when modifying register usage.  */
#define count		mask
#define offset		pos
#define neg_offset	x15

/* Define endian dependent shift operations.
   On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
   */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif
81
/*----------------------------------------------------------------------
   int strncmp (const char *s1, const char *s2, size_t n)

   In:   src1 = x0, src2 = x1, limit = x2 (see #defines above).
   Out:  result = x0 — zero if the strings are equal up to the limit or
         up to a common NUL; otherwise the difference of the first
         unequal bytes, compared as unsigned chars.

   Strategy: when both pointers share alignment mod 8, compare a dword
   at a time, detecting an embedded NUL with the (X - 1) & ~(X | 0x7f)
   trick.  Otherwise fall back to a byte loop for short limits, or
   align SRC1 and synthesize SRC2 dwords from shifted 16-byte loads.  */
ENTRY (strncmp)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	cbz	limit, L(ret0)		/* A zero limit compares equal.  */
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7		/* Same alignment modulo 8?  */
	and	count, src1, #7		/* Bytes of src1 past a dword edge.  */
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)	/* Mutually aligned, but mid-dword.  */

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8	/* "hi" below <=> more than 8 bytes were left.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	endloop, #0, #0, eq	/* If no NUL, test endloop; else force "ne".  */
	b.eq	L(loop_aligned)		/* Loop while no NUL, no diff, limit not hit.  */
	/* End of main loop */

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul	/* Marks first diff bit / NUL top bit.  */
	add	limit, limit, 8	/* Rewind limit to before last subs. */
L(syndrome_check):
	/* Limit was reached. Check if the NUL byte or the difference
	   is before the limit. */
	rev	syndrome, syndrome	/* Little-endian: byte-reverse so clz finds
					   the lowest-addressed marked byte.  */
	rev	data1, data1
	clz	pos, syndrome		/* pos = bit offset of the first event.  */
	rev	data2, data2
	lsl	data1, data1, pos	/* Bring the critical byte to the top.  */
	cmp	limit, pos, lsr #3	/* "hi" iff the event lies within the limit.  */
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel result, result, xzr, hi	/* Event past the limit: return 0.  */
	ret
#else
	/* Not reached the limit, must have found the end or a diff.  */
	tbz	limit, #63, L(not_limit)	/* limit >= 0: event within limit.  */
	add	tmp1, limit, 8		/* tmp1 = valid bytes in the last dword.  */
	cbz	limit, L(not_limit)	/* NOTE(review): limit has bit 63 set when
					   we fall through the tbz above, so this
					   cbz appears unreachable; kept as in the
					   upstream routine — confirm before
					   removing.  */

	lsl	limit, tmp1, #3	/* Bytes -> bits.  */
	mov	mask, #~0
	lsr	mask, mask, limit	/* Mask of (big-endian low) bytes past limit.  */
	bic	data1, data1, mask	/* Discard out-of-limit bytes.  */
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne		/* 0 if equal, 1 if different...  */
	cneg	result, result, lo	/* ...negated when data1 < data2.  */
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
	/* Adjust the limit and ensure it doesn't overflow.  */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo	/* Saturate to all-ones on carry out.  */
	orr	data1, data1, tmp2	/* Force the pre-start bytes of both words
					   to 0xff: equal and non-NUL, so they can
					   never register as a diff or a NUL.  */
	orr	data2, data2, tmp2
	b	L(start_realigned)

	.p2align 4
	/* Don't bother with dwords for up to 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1	/* "hi" iff bytes remain after this one.  */
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)		/* Loop while limit remains, byte is
					   non-NUL, and the bytes are equal.  */
L(done):
	sub	result, data1, data2	/* Difference of the last bytes loaded.  */
	ret
	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop.  */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7	/* count = bytes needed to align src1 up.  */
	sub	limit, limit, count	/* Pre-deduct; the loop below consumes
					   exactly count bytes.  */

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1		/* "cs" iff the byte is non-NUL.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)			/* NUL hit or bytes differ.  */
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order. For little-endian, it is
	   reversed in the registers. The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.
	   src1     | a a a a a a a a | b b b c c c c c | . . .
	   src2     | x x x x x a a a   a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
	                STEP_A              STEP_B              STEP_C
	   data1    a a a a a a a a     b b b c c c c c     b b b c c c c c
	   data2    a a a a a a a a     b b b 0 0 0 0 0     0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
	   time from SRC2. The comparison happens in 3 steps. After each step
	   the loop can exit, or read from SRC1 or SRC2. */
L(src1_aligned):
	/* Calculate offset from 8 byte alignment to string start in bits. No
	   need to mask offset since shifts are ignoring upper bits. */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf	/* Align src2 down to 16 bytes.  */
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset	/* Mask of bytes ahead of the string start.  */
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later. */
	/* Skip the first compare if data in tmp1 is irrelevant. */
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.*/
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones. */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1. */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)	/* NUL, difference or limit: finish up.  */

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2. */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid carry-propagation
	problem described above. This way we can reuse the has_nul in the
	next step and also use syndrome value trick at the end. */
	rev	tmp3, data1
	#define data1_fixed tmp3
#else
	#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3	/* Enough limit for this partial dword?  */
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes. */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1. */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes. */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones. */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef	__AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3	/* Event before the limit (in bits)?  */
	b.lo	L(end_quick)		/* Yes: compute the byte difference.  */
#endif

L(ret0):
	mov	result, #0		/* Equal within the limit.  */
	ret
END(strncmp)
333#endif
334