/*
   strcpy/stpcpy - copy a string returning pointer to start/end.

   Copyright (c) 2013, 2014, 2015, 2020-2023 ARM Ltd.
   All Rights Reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the company nor the names of its contributors
         may be used to endorse or promote products derived from this
         software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */
#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
/* See strcpy-stub.c  */
#else

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible: every read is either within the string or 16-byte
 * aligned, so no load can stray into a differently tagged granule.
 */

#include "asmdefs.h"

#define dstin		x0
#define srcin		x1
#define result		x0

#define src		x2
#define dst		x3
/* len/synd and tmp/shift share registers since their uses do not
   conflict.  dataq/vdata (and dataq2/vhas_nul) are the q- and v-register
   views of the same vector registers.  */
#define len		x4
#define synd		x4
#define tmp		x5
#define shift		x5
#define data1		x6
#define dataw1		w6
#define data2		x7
#define dataw2		w7

#define dataq		q0
#define vdata		v0
#define vhas_nul	v1
#define vend		v2
#define dend		d2
#define dataq2		q1

#ifdef BUILD_STPCPY
# define STRCPY stpcpy
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY strcpy
# define IFSTPCPY(X,...)
#endif

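/* This file implements strcpy by default.  Defining BUILD_STPCPY (in
   ARM's optimized-routines this is typically done by a small wrapper
   source that defines the macro and then includes this file) builds
   stpcpy instead; the only difference is the return value, which for
   stpcpy points at the terminating NUL in the destination rather than
   at its start:

     char buf[8];
     char *end = stpcpy (buf, "abc");   // end == buf + 3, *end == '\0'

   The IFSTPCPY (add result, ...) instructions below compute exactly
   that.  */
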
/*
   Core algorithm:
   For each 16-byte chunk we calculate a 64-bit nibble mask value with four
   bits per byte. We keep four bits of every comparison byte using a
   shift-right-and-narrow-by-4 (shrn) instruction. Since the bits in the
   nibble mask reflect the order in which bytes occur in the original string,
   counting leading zeros identifies exactly which byte matched.  */

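/* Illustrative only: a rough C equivalent of the syndrome computation,
   assuming the ACLE intrinsics from <arm_neon.h> (the assembly below is
   the authoritative implementation):

     #include <arm_neon.h>

     static inline uint64_t
     nul_syndrome (uint8x16_t data)
     {
       uint8x16_t eq = vceqq_u8 (data, vdupq_n_u8 (0)); // 0xff per NUL byte
       // shrn by 4 keeps four bits of each comparison byte, compressing
       // the 128-bit comparison result into a 64-bit nibble mask.
       uint8x8_t nib = vshrn_n_u16 (vreinterpretq_u16_u8 (eq), 4);
       return vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);
     }

   On little-endian the first NUL is at byte __builtin_ctzll (synd) >> 2;
   the code uses rbit + clz, which computes the same thing.  */
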
ENTRY (STRCPY)
	PTR_ARG (0)
	PTR_ARG (1)
	/* Align the first load down to 16 bytes; the syndrome is shifted
	   right by four bits per byte of misalignment (lsr uses only the
	   low six bits of shift) to drop matches before the string start.  */
	bic	src, srcin, 15
	ld1	{vdata.16b}, [src]
	cmeq	vhas_nul.16b, vdata.16b, 0
	lsl	shift, srcin, 2
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	lsr	synd, synd, shift
	cbnz	synd, L(tail)

	/* No NUL in the first chunk; check the second one.  */
	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	shrn	vend.8b, vhas_nul.8h, 4
	fmov	synd, dend
	cbz	synd, L(start_loop)
96
97#ifndef __AARCH64EB__
98	rbit	synd, synd
99#endif
100	sub	tmp, src, srcin
101	clz	len, synd
102	add	len, tmp, len, lsr 2
103	tbz	len, 4, L(less16)
104	sub	tmp, len, 15
105	ldr	dataq, [srcin]
106	ldr	dataq2, [srcin, tmp]
107	str	dataq, [dstin]
108	str	dataq2, [dstin, tmp]
109	IFSTPCPY (add result, dstin, len)
110	ret
111
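/* The 17..32 byte copy above, and the smaller cases at L(less16) and
   L(less8) below, all use the same overlapping trick: one move from the
   start and one from the end cover every length in the range without a
   byte loop.  A rough C sketch of the idea for the 16 < n <= 32 case
   (illustrative only):

     #include <string.h>

     // Copy n bytes, 16 < n <= 32; the two 16-byte moves may overlap.
     static void copy17_32 (char *dst, const char *src, size_t n)
     {
       char head[16], tail[16];
       memcpy (head, src, 16);
       memcpy (tail, src + n - 16, 16);
       memcpy (dst, head, 16);
       memcpy (dst + n - 16, tail, 16);
     }
 */
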
L(tail):
	/* NUL is within the first 16 bytes: len = its byte index.  */
	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 2
L(less16):
	tbz	len, 3, L(less8)
	sub	tmp, len, 7
	ldr	data1, [srcin]
	ldr	data2, [srcin, tmp]
	str	data1, [dstin]
	str	data2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(less8):
	subs	tmp, len, 3
	b.lo	L(less4)
	ldr	dataw1, [srcin]
	ldr	dataw2, [srcin, tmp]
	str	dataw1, [dstin]
	str	dataw2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

L(less4):
	cbz	len, L(zerobyte)
	/* 1-2 data bytes: a half-word covers them (for len == 1 the
	   second byte read is the NUL itself, so it is in bounds).  */
	ldrh	dataw1, [srcin]
	strh	dataw1, [dstin]
L(zerobyte):
	strb	wzr, [dstin, len]
	IFSTPCPY (add result, dstin, len)
	ret

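/* The loop below is unrolled twice and software-pipelined: each half
   stores the previous chunk while testing the next.  Inside the loop a
   cheap any-NUL test suffices (umaxp folds the 16 comparison bytes down
   to 8; a nonzero result means some byte was NUL); the exact nibble mask
   is only formed once, at L(loopend).  A simplified, non-pipelined C
   sketch (illustrative only, reusing nul_syndrome from the sketch above):

     static void copy_aligned_chunks (uint8_t *d, const uint8_t *s)
     {
       for (;;)
         {
           uint8x16_t c = vld1q_u8 (s);   // s stays 16-byte aligned
           if (nul_syndrome (c) != 0)
             break;                       // this chunk holds the NUL
           vst1q_u8 (d, c);
           s += 16;
           d += 16;
         }
     }
 */
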
	.p2align 4
L(start_loop):
	/* Copy the first 16 bytes unaligned, then continue from the
	   aligned chunk at src, keeping dst = src - (srcin - dstin).  */
	sub	tmp, srcin, dstin
	ldr	dataq2, [srcin]
	sub	dst, src, tmp
	str	dataq2, [dstin]
L(loop):
	str	dataq, [dst], 32
	ldr	dataq, [src, 16]
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbnz	synd, L(loopend)
	str	dataq, [dst, -16]
	ldr	dataq, [src, 32]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop)
	add	dst, dst, 16
L(loopend):
	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
	fmov	synd, dend
	sub	dst, dst, 31
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	len, synd
	lsr	len, len, 2
	add	dst, dst, len
	/* Final overlapping 16-byte copy ending exactly at the NUL; tmp
	   still holds srcin - dstin, so [dst, tmp] addresses the matching
	   source bytes.  */
	ldr	dataq, [dst, tmp]
	str	dataq, [dst]
	IFSTPCPY (add result, dst, 15)
	ret

END (STRCPY)
#endif