/* Copyright (c) 2012-2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * Copyright (c) 2015 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) || !defined(__LP64__)
/* See memcpy-stub.c  */
#else

/* Register roles.  Everything lives in x0-x13, so no callee-saved
   register and no stack is touched.

   Aliasing to keep in mind when editing:
     - tmp1 and B_h are both x9: tmp1 must be dead before B_h is loaded.
     - E_l/E_h reuse src/count and F_l/F_h reuse srcend/dst: they may only
       be loaded once the aliased pointer/length is no longer needed.  */
#define dstin	x0	/* Destination as passed in; never written.  */
#define src	x1	/* Source pointer.  */
#define count	x2	/* Byte count.  */
#define dst	x3	/* 16-byte aligned store pointer (large copies).  */
#define srcend	x4	/* src + count.  */
#define dstend	x5	/* dstin + count.  */
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define tmp1	x9

/* File-local label: L(name) expands to .Lname.  */
#define L(l) .L ## l

/* def_fn NAME [p2align=N]: open global function NAME in .text, aligned
   to 2^N bytes, and emit its entry label.  */
	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled. Large copies
   of more than 96 bytes align the destination and use an unrolled loop
   processing 64 bytes per iteration.
   Small and medium copies read all data before writing, allowing any
   kind of overlap, and memmove tailcalls memcpy for these cases as
   well as non-overlapping copies.
*/

/* void *memcpy (void *dstin, const void *src, size_t count)

   In:       x0 = dstin, x1 = src, x2 = count.
   Out:      x0 = dstin (x0 is only ever read here).
   Clobbers: x1-x13 and the condition flags.
   Stack:    none.

   Every size class copies from both ends of the buffer with accesses
   that may overlap in the middle, so no byte-granular tail loop is
   needed.  */
def_fn memcpy p2align=6
	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  Bits 6 and 5 of count - 1 select
	   the 65..96, 33..64 or 17..32 byte variant.  */
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, L(copy32)
	/* 33..64 bytes: additionally copy the second and the
	   second-to-last 16 bytes.  tmp1 (x9) is dead from here on, so
	   B_h may be loaded.  */
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	/* 17..32 bytes, and common tail of the 33..64 byte case: store
	   the first and the last 16 bytes.  */
L(copy32):
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
	cmp	count, 8
	b.lo	L(copy_lt8)
	/* 8..16 bytes: first and last 8 bytes.  */
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
L(copy_lt8):
	tbz	count, 2, L(copy_lt4)
	/* 4..7 bytes: first and last 4 bytes.  */
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
L(copy_lt4):
	cbz	count, L(copy_done)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
L(copy_done):
	ret

	.p2align 4
	/* Copy 65..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  A_l/A_h already hold the first 16 bytes.
	   The E and F loads overwrite src, count, srcend and dst, so they
	   come after every other load.  */
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are more than 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and keeps its loads one iteration
	   ahead of its stores.  */

	.p2align 4
L(copy_long):
	and	tmp1, dstin, 15		/* tmp1 = misalignment of dstin.  */
	bic	dst, dstin, 15		/* dst = dstin rounded down to 16.  */
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1		/* Keep src in step with dst.  */
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]	/* First 16 bytes, unaligned.  */
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  E_l/E_h overwrite src and count, which
	   are no longer needed.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.size	memcpy, . - memcpy
#endif
231