/*
   strcpy/stpcpy - copy a string returning pointer to start/end.

   Copyright (c) 2013, 2014, 2015, 2020-2023 ARM Ltd.
   All Rights Reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the company nor the names of its contributors
         may be used to endorse or promote products derived from this
         software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
#include <picolibc.h>

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__) || !defined(__ARM_NEON)
/* See strcpy-stub.c  */
#else

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "asmdefs.h"

#define dstin		x0
#define srcin		x1
#define result		x0

#define src		x2
#define dst		x3
#define len		x4
#define synd		x4
#define tmp		x5
#define shift		x5
#define data1		x6
#define dataw1		w6
#define data2		x7
#define dataw2		w7

#define dataq		q0
#define vdata		v0
#define vhas_nul	v1
#define vend		v2
#define dend		d2
#define dataq2		q1

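/* The same source builds either strcpy or stpcpy.  IFSTPCPY expands its
   arguments only when building stpcpy, which returns a pointer to the
   terminating NUL rather than to the start of the destination.  */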
#ifdef BUILD_STPCPY
# define STRCPY stpcpy
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY strcpy
# define IFSTPCPY(X,...)
#endif

/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
   bits per byte. We take four bits of every comparison byte with a
   shift-right-and-narrow-by-4 instruction. Since the bits in the nibble mask
   reflect the order in which things occur in the original string, counting
   leading zeros identifies exactly which byte matched.  */
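/* For illustration only: the same nibble-mask idea written in C with NEON
   (ACLE) intrinsics.  This sketch is not part of the build; the helper name
   and the aligned-pointer argument are hypothetical.

   #include <arm_neon.h>
   #include <stdint.h>

   // Return the byte index of the first NUL in a 16-byte aligned chunk,
   // or 16 if there is none (little-endian).
   static inline unsigned first_nul_index (const uint8_t *aligned_src)
   {
     uint8x16_t chunk  = vld1q_u8 (aligned_src);
     uint8x16_t is_nul = vceqq_u8 (chunk, vdupq_n_u8 (0));  // 0xFF per NUL byte
     // Shift right and narrow by 4: keeps 4 bits per byte, giving a 64-bit
     // nibble mask whose bit order matches the byte order of the chunk.
     uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (is_nul), 4);
     uint64_t  synd    = vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
     return synd ? __builtin_ctzll (synd) >> 2 : 16;
   }  */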

ENTRY (STRCPY)
	PTR_ARG (0)
	PTR_ARG (1)
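	/* Check the aligned 16-byte chunk containing the start of the string.
	   Shifting the nibble mask right by 4 * (srcin % 16) discards match
	   bits for bytes that precede srcin.  */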
	bic	src, srcin, 15
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0
	lsl	shift, srcin, 2
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	lsr	synd, synd, shift
	cbnz	synd, L(tail)

	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	cbz	synd, L(start_loop)

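	/* The NUL is in the second 16-byte chunk.  Compute the string length
	   in len; if len is 16..31, copy the string (including the NUL) with
	   two overlapping 16-byte accesses, otherwise branch to the shorter
	   cases below.  */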
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	sub	tmp, src, srcin
	clz	len, synd
	add	len, tmp, len, lsr 2
	tbz	len, 4, L(less16)
	sub	tmp, len, 15
	ldr	dataq, [srcin]
	ldr	dataq2, [srcin, tmp]
	str	dataq, [dstin]
	str	dataq2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

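	/* The NUL is in the first 16-byte chunk: compute the string length
	   and fall through to the short-copy paths below.  */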
L(tail):
	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 2
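	/* len < 16: strings of 8..15 characters are copied with two
	   overlapping 8-byte accesses (the second ends at the NUL); shorter
	   strings branch to L(less8).  */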
L(less16):
	tbz	len, 3, L(less8)
	sub	tmp, len, 7
	ldr	data1, [srcin]
	ldr	data2, [srcin, tmp]
	str	data1, [dstin]
	str	data2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
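	/* len < 8: strings of 3..7 characters are copied with two overlapping
	   4-byte accesses; anything shorter drops to L(less4).  */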
L(less8):
	subs	tmp, len, 3
	b.lo	L(less4)
	ldr	dataw1, [srcin]
	ldr	dataw2, [srcin, tmp]
	str	dataw1, [dstin]
	str	dataw2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

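	/* len is 0..2: copy at most two data bytes, then store the
	   terminating NUL explicitly.  */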
L(less4):
	cbz	len, L(zerobyte)
	ldrh	dataw1, [srcin]
	strh	dataw1, [dstin]
L(zerobyte):
	strb	wzr, [dstin, len]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
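	/* No NUL in the first 32 bytes from the aligned start: store the
	   first 16 bytes of the string unaligned, then copy 32 bytes per
	   iteration from the 16-byte aligned source.  */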
L(start_loop):
	sub	tmp, srcin, dstin
	ldr	dataq2, [srcin]
	sub	dst, src, tmp
	str	dataq2, [dstin]
L(loop):
	str	dataq, [dst], 32
	ldr	dataq, [src, 16]
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbnz	synd, L(loopend)
	str	dataq, [dst, -16]
	ldr	dataq, [src, 32]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop)
	add	dst, dst, 16
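	/* A NUL was found in the chunk just loaded.  Rebuild the precise
	   nibble mask (umaxp above only detects that a NUL is present),
	   locate the NUL, and copy the final 16 bytes of the string from the
	   source so that the last byte stored is the terminating NUL.  */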
L(loopend):
	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
	fmov	synd, dend
	sub	dst, dst, 31
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	len, synd
	lsr	len, len, 2
	add	dst, dst, len
	ldr	dataq, [dst, tmp]
	str	dataq, [dst]
	IFSTPCPY (add result, dst, 15)
	ret

END (STRCPY)
#endif