/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include <picolibc.h>

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
/* See memcpy-stub.c  */
#else
#include "asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14
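
/* G_l..H_h alias the argument registers (count, dst, src and srcend), and
   tmp1 aliases E_l (x14); each alias is only used once the register it
   shadows is dead, so no extra registers are needed.  */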

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into three main cases: small copies of up to 32 bytes,
   medium copies of up to 128 bytes, and large copies.  The overhead of the
   overlap check is negligible since it is only required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.
*/
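
/* For reference, the dispatch is roughly the following C sketch.  This is
   illustrative only; the helper names are descriptive and do not correspond
   to symbols in this file:

	void *memcpy (void *dstin, const void *src, size_t count)
	{
	    if (count > 128)
		copy_long (dstin, src, count);	// forwards, or backwards on overlap
	    else if (count > 32)
		copy32_128 (dstin, src, count);	// overlapping 16-byte pairs
	    else
		copy_small (dstin, src, count);	// 0..32 bytes
	    return dstin;
	}
*/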

ENTRY_ALIAS (memmove)
ENTRY (memcpy)
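	/* PTR_ARG and SIZE_ARG are provided by asmdefs.h; they sanitize
	   pointer and size arguments on ABIs where those are 32-bit and are
	   expected to expand to nothing on LP64.  */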
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
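	/* Copy 16..32 bytes: one 16-byte pair from the start and one from
	   the end; the two may overlap in the middle, which is harmless.  */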
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
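	/* count is 1..3 here.  Copy the first byte, the last byte and the
	   byte at count/2; for counts 1 and 2 some of these writes alias,
	   so all bytes are covered without any further branching.  */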
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
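	/* Copy 97..128 bytes: two further pairs from 64 and 48 before the
	   end; their stores may overlap the ones below in the middle.  */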
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
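	/* The unsigned test (dstin - src) < count is true exactly when dstin
	   lies inside [src, src + count), i.e. when a forward copy would
	   overwrite source bytes before reading them; if dstin < src the
	   subtraction wraps to a large value and the test fails.  */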
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */

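	/* dst is rounded down to a 16-byte boundary and src is moved back by
	   the same offset, so the loop stores below are aligned while the
	   loads may not be.  The first 16 bytes are stored unaligned from
	   dstin and may overlap the first aligned store.  */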
	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

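	/* Software-pipelined loop: each iteration stores the 64 bytes loaded
	   by the previous one before issuing the next four loads, so stores
	   never wait on loads from the same iteration.  */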
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
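	/* Rather than branching on the exact remainder, the final 64 bytes
	   are always copied from srcend - 64; these stores may overlap the
	   bytes written by the last loop iteration.  */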
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dstend to 16-byte alignment.  */
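	/* The last 16 bytes are stored unaligned at dstend - 16, then dstend
	   is rounded down by tmp1 so that the loop stores are 16-byte
	   aligned, mirroring the forward path.  */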
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

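	/* Mirror image of loop64: stores use the data loaded on the previous
	   iteration, and both pointers step downwards with pre-indexed
	   writeback addressing.  */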
L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
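	/* src and dstin still point at the start of the buffer, so the first
	   64 bytes are copied directly from them.  G_l/G_h alias count/dst,
	   which are dead by this point.  */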
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (memcpy)
#endif