1/*
2 * memchr - find a character in a memory zone
3 *
4 * Copyright (c) 2014-2022, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8#include <picolibc.h>
9
10#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__) || !defined(__ARM_NEON)
11/* See memchr-stub.c  */
12#else
13/* Assumptions:
14 *
15 * ARMv8-a, AArch64
16 * Neon Available.
17 */
18
19#include "asmdefs.h"
20
/* Arguments and results.  */
#define srcin		x0	/* in: pointer to the buffer to search */
#define chrin		w1	/* in: byte value to look for (low 8 bits used) */
#define cntin		x2	/* in: number of bytes to search */

#define result		x0	/* out: address of first match, or NULL */

/* Scratch registers (all caller-saved under AAPCS64).  */
#define src		x3	/* 32-byte-aligned working pointer, post-incremented */
#define	tmp		x4	/* general scratch (shift counts, adjustments) */
#define wtmp2		w5	/* holds the 0x40100401 lane-id magic constant */
#define synd		x6	/* 64-bit syndrome: two bits per input byte */
#define soff		x9	/* srcin % 32: misalignment of the start pointer */
#define cntrem		x10	/* cntin % 32: used to mask the final block */

/* NEON registers (v0-v7 are caller-saved under AAPCS64).  */
#define vrepchr		v0	/* target byte replicated into all 16 lanes */
#define vdata1		v1	/* first 16 bytes of the current 32-byte chunk */
#define vdata2		v2	/* second 16 bytes of the current 32-byte chunk */
#define vhas_chr1	v3	/* per-byte match mask for vdata1 */
#define vhas_chr2	v4	/* per-byte match mask for vdata2 */
#define vrepmask	v5	/* 0x40100401 replicated per 32-bit element */
#define vend		v6	/* pairwise-add reduction accumulator */
42
43/*
44 * Core algorithm:
45 *
46 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
47 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
48 * requested character and bit 1 is not used (faster than using a 32bit
49 * syndrome). Since the bits in the syndrome reflect exactly the order in which
 * things occur in the original string, counting trailing zeros allows us to
51 * identify exactly which byte has matched.
52 */
53
/*
 * void *memchr (const void *srcin, int chrin, size_t cntin)
 *
 * AAPCS64.  In: x0 = srcin, w1 = chrin, x2 = cntin.
 * Out: x0 = pointer to the first byte equal to (chrin & 0xff),
 * or NULL if no such byte occurs in the first cntin bytes.
 * Clobbers: x3-x6, x9, x10, v0-v6, flags.
 */
ENTRY (memchr)
	PTR_ARG (0)
	SIZE_ARG (2)
	/* Do not dereference srcin if no bytes to compare.  */
	cbz	cntin, L(zero_length)
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.  Replicated into every 32-bit element of
	 * vrepmask, it gives each byte lane a distinct bit so the addp
	 * reductions below pack 32 per-byte match masks into one 64-bit
	 * syndrome with two bits per byte.
	 */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin		/* splat target byte across 16 lanes */
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31			/* round srcin down to 32-byte boundary */
	dup	vrepmask.4s, wtmp2
	ands	soff, srcin, #31		/* soff = misalignment; also sets Z */
	and	cntrem, cntin, #31		/* cntin % 32, for the final-block mask */
	b.eq	L(loop)				/* already aligned: enter main loop */

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32 bytes block containing the first bytes
	 * and mask the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	sub	tmp, soff, #32			/* tmp = -(32 - soff), bytes read before srcin */
	adds	cntin, cntin, tmp		/* cntin -= (32 - soff); flags record whether
						   the request ends inside this first block */
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Reduce the 32 byte-wide match masks to the 64-bit syndrome.  */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits: bytes before srcin must never match */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
	/* The first block can also be the last: b.ls consumes the flags set
	   by the adds above (taken when no bytes remain past this block).  */
	b.ls	L(masklast)
	/* Have we found something already? */
	cbnz	synd, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	subs	cntin, cntin, #32		/* flags: <= 0 remaining means last block */
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result */
	b.ls	L(end)
	/* Use a fast check for the termination condition: any set lane in
	   either mask makes the folded 64-bit value non-zero.  */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
	mov	synd, vend.d[0]
	/* We're not out of data, loop if we haven't found the character */
	cbz	synd, L(loop)

L(end):
	/* Termination condition found, let's calculate the syndrome value */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Only do the clear for the last possible block: b.hs reuses the
	   flags from the subs in the loop — taken when the buffer ends
	   exactly at this block boundary, so no upper-bit masking needed.  */
	b.hs	L(tail)

L(masklast):
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits, discarding
	   matches in the bytes read past the end of the buffer.  */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1		/* tmp = (32 - ((cntrem+soff)%32)) * 2 */
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp

L(tail):
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate the last post-increment */
	sub	src, src, #32
	/* Check that we have found a character (Z flag feeds csel below) */
	cmp	synd, #0
	/* And count the leading zeros */
	clz	synd, synd
	/* Compute the potential result: the syndrome holds two bits per
	   byte, so halving the bit index gives the byte offset in the block */
	add	result, src, synd, lsr #1
	/* Select result or NULL */
	csel	result, xzr, result, eq
	ret

L(zero_length):
	mov	result, #0
	ret

END (memchr)
151#endif
152