1/*
2 * memcpy - copy memory area
3 *
4 * Copyright (c) 2012-2022, Arm Limited.
5 * SPDX-License-Identifier: MIT
6 */
7
8/* Assumptions:
9 *
10 * ARMv8-a, AArch64, unaligned accesses.
11 *
12 */
13
14#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
15/* See memcpy-stub.c  */
16#else
17#include "asmdefs.h"
18
/* Incoming arguments: destination, source and byte count.  */
19#define dstin	x0
20#define src	x1
21#define count	x2
/* Working pointers.  dst is the aligned-down destination cursor used by the
   forward large-copy loop; srcend/dstend are set to src + count and
   dstin + count at function entry.  */
22#define dst	x3
23#define srcend	x4
24#define dstend	x5
/* Data registers.  Each X_l/X_h pair carries 16 bytes through one ldp/stp.
   The _lw names are the 32-bit views used by the 4-byte and 1-byte paths.  */
25#define A_l	x6
26#define A_lw	w6
27#define A_h	x7
28#define B_l	x8
29#define B_lw	w8
30#define B_h	x9
31#define C_l	x10
32#define C_lw	w10
33#define C_h	x11
34#define D_l	x12
35#define D_h	x13
36#define E_l	x14
37#define E_h	x15
38#define F_l	x16
39#define F_h	x17
/* G and H are not extra registers: they reuse count, dst, src and srcend.
   Loading into them destroys those values, so they may only be used at
   points where the aliased pointer/counter is no longer read.  */
40#define G_l	count
41#define G_h	dst
42#define H_l	src
43#define H_h	srcend
/* Scratch.  Shares x14 with E_l, so tmp1 and E_l cannot be live together.  */
44#define tmp1	x14
45
46/* This implementation handles overlaps and supports both memcpy and memmove
47   from a single entry point.  It uses unaligned accesses and branchless
48   sequences to keep the code small, simple and improve performance.
49
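50   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
51   copies of up to 128 bytes, and large copies.  The overhead of the overlap
52   check is negligible since it is only required for large copies: the small
   and medium paths load every source byte into registers before storing
   any, so they are correct for overlapping buffers without a check.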
53
54   Large copies use a software pipelined loop processing 64 bytes per iteration.
55   The destination pointer is 16-byte aligned to minimize unaligned accesses.
56   The loop tail is handled by always copying 64 bytes from the end.
57*/
58
59ENTRY_ALIAS (memmove)
60ENTRY (memcpy)
	/* Shared memcpy/memmove body.
	   In:  dstin (x0) = destination, src (x1) = source, count (x2) = bytes.
	   Out: dstin is never written below, so x0 still holds the destination
	        pointer at every ret.
	   Uses only x0-x17 and never touches sp.
	   NOTE(review): PTR_ARG/SIZE_ARG come from asmdefs.h, which is not
	   visible here; presumably they normalise pointer/size arguments for
	   ILP32 builds -- confirm.  */
61	PTR_ARG (0)
62	PTR_ARG (1)
63	SIZE_ARG (2)
	/* One-past-the-end pointers; every path copies its tail relative to
	   these so that no byte-granular remainder loop is needed.  */
64	add	srcend, src, count
65	add	dstend, dstin, count
	/* Dispatch on size (unsigned): > 128 large, 33..128 medium, else small.  */
66	cmp	count, 128
67	b.hi	L(copy_long)
68	cmp	count, 32
69	b.hi	L(copy32_128)
70
71	/* Small copies: 0..32 bytes.  */
	/* 16..32 bytes: first 16 and last 16 bytes.  The two ranges overlap
	   when count < 32.  Both loads are issued before either store.  */
72	cmp	count, 16
73	b.lo	L(copy16)
74	ldp	A_l, A_h, [src]
75	ldp	D_l, D_h, [srcend, -16]
76	stp	A_l, A_h, [dstin]
77	stp	D_l, D_h, [dstend, -16]
78	ret
79
80	/* Copy 8-15 bytes.  */
	/* Reached with count < 16, so bit 3 set means 8..15.  Copies the first
	   and last 8 bytes (overlapping when count < 16).  */
81L(copy16):
82	tbz	count, 3, L(copy8)	/* Bit 3 clear: count < 8.  */
83	ldr	A_l, [src]
84	ldr	A_h, [srcend, -8]
85	str	A_l, [dstin]
86	str	A_h, [dstend, -8]
87	ret
88
89	.p2align 3
90	/* Copy 4-7 bytes.  */
	/* Reached with count < 8, so bit 2 set means 4..7.  Copies the first
	   and last 4 bytes using the 32-bit register views.  */
91L(copy8):
92	tbz	count, 2, L(copy4)	/* Bit 2 clear: count < 4.  */
93	ldr	A_lw, [src]
94	ldr	B_lw, [srcend, -4]
95	str	A_lw, [dstin]
96	str	B_lw, [dstend, -4]
97	ret
98
99	/* Copy 0..3 bytes using a branchless sequence.  */
	/* After the zero check, copy the first byte, the byte at index
	   count / 2 and the last byte.  count = 1 touches index 0 three times,
	   count = 2 touches 0, 1, 1 and count = 3 touches 0, 1, 2.  */
100L(copy4):
101	cbz	count, L(copy0)
102	lsr	tmp1, count, 1		/* tmp1 = count / 2 (0 or 1).  */
103	ldrb	A_lw, [src]
104	ldrb	C_lw, [srcend, -1]
105	ldrb	B_lw, [src, tmp1]
106	strb	A_lw, [dstin]
107	strb	B_lw, [dstin, tmp1]
108	strb	C_lw, [dstend, -1]
	/* Shared return; also the target for count == 0 and, in the large
	   path, for dstin == src.  */
109L(copy0):
110	ret
111
112	.p2align 4
113	/* Medium copies: 33..128 bytes.  */
	/* Always load the first 32 and the last 32 bytes.  For 33..64 bytes
	   those four pairs cover the whole buffer.  */
114L(copy32_128):
115	ldp	A_l, A_h, [src]
116	ldp	B_l, B_h, [src, 16]
117	ldp	C_l, C_h, [srcend, -32]
118	ldp	D_l, D_h, [srcend, -16]
119	cmp	count, 64
120	b.hi	L(copy128)
121	stp	A_l, A_h, [dstin]
122	stp	B_l, B_h, [dstin, 16]
123	stp	C_l, C_h, [dstend, -32]
124	stp	D_l, D_h, [dstend, -16]
125	ret
126
127	.p2align 4
128	/* Copy 65..128 bytes.  */
	/* A..D already hold the first and last 32 bytes.  E/F add source bytes
	   32..63; for more than 96 bytes G/H add the 32 bytes starting at
	   srcend - 64.  */
129L(copy128):
130	ldp	E_l, E_h, [src, 32]
131	ldp	F_l, F_h, [src, 48]
132	cmp	count, 96		/* Last read of count on this path.  */
133	b.ls	L(copy96)
	/* G aliases count/dst and H aliases src/srcend.  The H load therefore
	   overwrites srcend, which is why the G load comes first; nothing
	   after this point reads count, dst, src or srcend.  */
134	ldp	G_l, G_h, [srcend, -64]
135	ldp	H_l, H_h, [srcend, -48]
136	stp	G_l, G_h, [dstend, -64]
137	stp	H_l, H_h, [dstend, -48]
	/* Store the first 64 bytes and the last 32 bytes.  */
138L(copy96):
139	stp	A_l, A_h, [dstin]
140	stp	B_l, B_h, [dstin, 16]
141	stp	E_l, E_h, [dstin, 32]
142	stp	F_l, F_h, [dstin, 48]
143	stp	C_l, C_h, [dstend, -32]
144	stp	D_l, D_h, [dstend, -16]
145	ret
146
147	.p2align 4
148	/* Copy more than 128 bytes.  */
149L(copy_long):
150	/* Use backwards copy if there is an overlap.  */
	/* tmp1 = dstin - src.  Zero means source and destination are the same
	   address, so return immediately.  The unsigned test tmp1 < count is
	   true when dstin lies inside [src, src + count); in that case a
	   forward copy would overwrite source bytes before reading them.  */
151	sub	tmp1, dstin, src
152	cbz	tmp1, L(copy0)
153	cmp	tmp1, count
154	b.lo	L(copy_long_backwards)
155
156	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
157
	/* tmp1 = dstin & 15.  dst is dstin rounded down to 16 bytes and src is
	   moved back by the same amount, so [src, N] still pairs with [dst, N].
	   The first 16 bytes are stored unaligned at dstin; the aligned stores
	   start at dst + 16.  */
158	ldp	D_l, D_h, [src]
159	and	tmp1, dstin, 15
160	bic	dst, dstin, 15
161	sub	src, src, tmp1
162	add	count, count, tmp1	/* Count is now 16 too large.  */
163	ldp	A_l, A_h, [src, 16]
164	stp	D_l, D_h, [dstin]
165	ldp	B_l, B_h, [src, 32]
166	ldp	C_l, C_h, [src, 48]
167	ldp	D_l, D_h, [src, 64]!	/* Writeback: src += 64.  */
	/* A..D now hold 64 bytes not yet stored.  If at most 64 further bytes
	   remain beyond them, the 64-byte copy from the end covers the rest.  */
168	subs	count, count, 128 + 16	/* Test and readjust count.  */
169	b.ls	L(copy64_from_end)
170
	/* Each iteration stores the 64 bytes loaded by the previous one and
	   loads the next 64.  The pre-indexed D pair advances dst and src by
	   64.  The loop continues while count stays above zero.  */
171L(loop64):
172	stp	A_l, A_h, [dst, 16]
173	ldp	A_l, A_h, [src, 16]
174	stp	B_l, B_h, [dst, 32]
175	ldp	B_l, B_h, [src, 32]
176	stp	C_l, C_h, [dst, 48]
177	ldp	C_l, C_h, [src, 48]
178	stp	D_l, D_h, [dst, 64]!
179	ldp	D_l, D_h, [src, 64]!
180	subs	count, count, 64
181	b.hi	L(loop64)
182
183	/* Write the last iteration and copy 64 bytes from the end.  */
	/* The pending A..D stores are interleaved with loads of the final 64
	   source bytes, which are then stored relative to dstend.  Loading E_l
	   overwrites tmp1 (same register), which is not read again here.  */
184L(copy64_from_end):
185	ldp	E_l, E_h, [srcend, -64]
186	stp	A_l, A_h, [dst, 16]
187	ldp	A_l, A_h, [srcend, -48]
188	stp	B_l, B_h, [dst, 32]
189	ldp	B_l, B_h, [srcend, -32]
190	stp	C_l, C_h, [dst, 48]
191	ldp	C_l, C_h, [srcend, -16]
192	stp	D_l, D_h, [dst, 64]
193	stp	E_l, E_h, [dstend, -64]
194	stp	A_l, A_h, [dstend, -48]
195	stp	B_l, B_h, [dstend, -32]
196	stp	C_l, C_h, [dstend, -16]
197	ret
198
199	.p2align 4
200
201	/* Large backwards copy for overlapping copies.
202	   Copy 16 bytes and then align dst to 16-byte alignment.  */
	/* Mirror image of the forward path, working down from the end.
	   tmp1 = dstend & 15.  srcend and count are reduced by tmp1 first.  The
	   last 16 bytes are stored unaligned at the original dstend, and only
	   afterwards is dstend itself moved down to the 16-byte boundary.  */
203L(copy_long_backwards):
204	ldp	D_l, D_h, [srcend, -16]
205	and	tmp1, dstend, 15
206	sub	srcend, srcend, tmp1
207	sub	count, count, tmp1
208	ldp	A_l, A_h, [srcend, -16]
209	stp	D_l, D_h, [dstend, -16]	/* dstend not yet aligned here.  */
210	ldp	B_l, B_h, [srcend, -32]
211	ldp	C_l, C_h, [srcend, -48]
212	ldp	D_l, D_h, [srcend, -64]!	/* Writeback: srcend -= 64.  */
213	sub	dstend, dstend, tmp1
	/* A..D hold 64 bytes not yet stored.  If at most 64 further bytes
	   remain below them, the 64-byte copy from the start covers the rest.  */
214	subs	count, count, 128
215	b.ls	L(copy64_from_start)
216
	/* Each iteration stores the previously loaded 64 bytes and loads the
	   next lower 64.  The pre-indexed D pair moves dstend and srcend down
	   by 64.  */
217L(loop64_backwards):
218	stp	A_l, A_h, [dstend, -16]
219	ldp	A_l, A_h, [srcend, -16]
220	stp	B_l, B_h, [dstend, -32]
221	ldp	B_l, B_h, [srcend, -32]
222	stp	C_l, C_h, [dstend, -48]
223	ldp	C_l, C_h, [srcend, -48]
224	stp	D_l, D_h, [dstend, -64]!
225	ldp	D_l, D_h, [srcend, -64]!
226	subs	count, count, 64
227	b.hi	L(loop64_backwards)
228
229	/* Write the last iteration and copy 64 bytes from the start.  */
	/* src and dstin were not modified on this path, so they still address
	   the first bytes.  G aliases count/dst, neither of which is read
	   after this point.  */
230L(copy64_from_start):
231	ldp	G_l, G_h, [src, 48]
232	stp	A_l, A_h, [dstend, -16]
233	ldp	A_l, A_h, [src, 32]
234	stp	B_l, B_h, [dstend, -32]
235	ldp	B_l, B_h, [src, 16]
236	stp	C_l, C_h, [dstend, -48]
237	ldp	C_l, C_h, [src]
238	stp	D_l, D_h, [dstend, -64]
239	stp	G_l, G_h, [dstin, 48]
240	stp	A_l, A_h, [dstin, 32]
241	stp	B_l, B_h, [dstin, 16]
242	stp	C_l, C_h, [dstin]
243	ret
244
245END (memcpy)
246#endif
247