/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* This memcpy routine is optimised for Cortex-M3/M4 cores with/without
   unaligned access.

   If compiled with GCC, this file should be enclosed within the following
   pre-processing check:
   #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)

   Prototype: void *memcpy (void *dst, const void *src, size_t count);

   The job is done in 5 steps.
   Step 1: Align src/dst pointers; if both cannot be aligned, fall back to
	   the misaligned copy path
   Step 2: Repeatedly copy big blocks of __OPT_BIG_BLOCK_SIZE bytes
   Step 3: Repeatedly copy mid blocks of __OPT_MID_BLOCK_SIZE bytes
   Step 4: Copy word by word
   Step 5: Copy byte by byte

   Tunable options:
     __OPT_BIG_BLOCK_SIZE: Size of big block in bytes.  Defaults to 64.
     __OPT_MID_BLOCK_SIZE: Size of mid block in bytes.  Defaults to 16.
 */
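
/* For orientation, a rough C sketch of the aligned-copy part of this
   strategy (steps 2-5).  This is an illustrative, hypothetical helper, not
   part of the build (it would need <stddef.h> and <stdint.h>); in the
   assembly below the two block loops are fully unrolled with .irp:

   static void *memcpy_sketch (void *dst, const void *src, size_t n)
   {
     uint32_t *d = dst;
     const uint32_t *s = src;       // step 1 has word-aligned both pointers

     while (n >= __OPT_BIG_BLOCK_SIZE)            // step 2: big blocks
       for (unsigned i = 0; i < __OPT_BIG_BLOCK_SIZE / 4; i++, n -= 4)
         *d++ = *s++;

     while (n >= __OPT_MID_BLOCK_SIZE)            // step 3: mid blocks
       for (unsigned i = 0; i < __OPT_MID_BLOCK_SIZE / 4; i++, n -= 4)
         *d++ = *s++;

     while (n >= 4)                               // step 4: single words
       {
         *d++ = *s++;
         n -= 4;
       }

     unsigned char *db = (unsigned char *) d;
     const unsigned char *sb = (const unsigned char *) s;
     while (n--)                                  // step 5: trailing bytes
       *db++ = *sb++;

     return dst;
   }  */
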
#include "arm_asm.h"

#ifndef __OPT_BIG_BLOCK_SIZE
#define __OPT_BIG_BLOCK_SIZE (4 * 16)
#endif

#ifndef __OPT_MID_BLOCK_SIZE
#define __OPT_MID_BLOCK_SIZE (4 * 4)
#endif
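
/* Both sizes may be overridden at build time, e.g. by passing
   -D__OPT_BIG_BLOCK_SIZE=32 to the compiler driver (the exact mechanism
   depends on the build system); only the values checked below are
   accepted.  */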

#if __OPT_BIG_BLOCK_SIZE == 16
#define BEGIN_UNROLL_BIG_BLOCK \
  .irp offset, 0,4,8,12
#elif __OPT_BIG_BLOCK_SIZE == 32
#define BEGIN_UNROLL_BIG_BLOCK \
  .irp offset, 0,4,8,12,16,20,24,28
#elif __OPT_BIG_BLOCK_SIZE == 64
#define BEGIN_UNROLL_BIG_BLOCK \
  .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
#else
#error "Illegal __OPT_BIG_BLOCK_SIZE"
#endif

#if __OPT_MID_BLOCK_SIZE == 8
#define BEGIN_UNROLL_MID_BLOCK \
  .irp offset, 0,4
#elif __OPT_MID_BLOCK_SIZE == 16
#define BEGIN_UNROLL_MID_BLOCK \
  .irp offset, 0,4,8,12
#else
#error "Illegal __OPT_MID_BLOCK_SIZE"
#endif

#define END_UNROLL .endr

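/* ASM_ALIAS new old: export NEW as a function-typed alias of OLD.
   .thumb_set is used when assembling for Thumb so that the alias keeps
   the Thumb function attribute.  */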
.macro	ASM_ALIAS new old
	.global	\new
	.type	\new, %function
#if defined (__thumb__)
	.thumb_set	\new, \old
#else
	.set	\new, \old
#endif
.endm

	.syntax unified
	.text
	.align	2
	.global	memcpy
	.thumb
	.thumb_func
	.fnstart
	.cfi_startproc
	.type	memcpy, %function
	ASM_ALIAS __aeabi_memcpy, memcpy
	ASM_ALIAS __aeabi_memcpy4, memcpy
	ASM_ALIAS __aeabi_memcpy8, memcpy
memcpy:
	@ r0: dst
	@ r1: src
	@ r2: len
#ifdef __ARM_FEATURE_UNALIGNED
	/* When unaligned access is supported, ip is not needed elsewhere in
	   the function body, so use it to preserve dst for the return value.  */
	prologue push_ip=HAVE_PAC_LEAF
	mov	ip, r0
#else
	prologue 0 push_ip=HAVE_PAC_LEAF
#endif /* __ARM_FEATURE_UNALIGNED */
	orr	r3, r1, r0
	ands	r3, r3, #3
	bne	.Lmisaligned_copy

.Lbig_block:
	subs	r2, __OPT_BIG_BLOCK_SIZE
	blo	.Lmid_block

	/* Kernel loop for big block copy */
	.align 2
.Lbig_block_loop:
	BEGIN_UNROLL_BIG_BLOCK
#ifdef __ARM_ARCH_7EM__
	ldr	r3, [r1], #4
	str	r3, [r0], #4
	END_UNROLL
#else /* __ARM_ARCH_7M__ */
	ldr	r3, [r1, \offset]
	str	r3, [r0, \offset]
	END_UNROLL
	adds	r0, __OPT_BIG_BLOCK_SIZE
	adds	r1, __OPT_BIG_BLOCK_SIZE
#endif
	subs	r2, __OPT_BIG_BLOCK_SIZE
	bhs .Lbig_block_loop

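	/* r2 still carries the -__OPT_BIG_BLOCK_SIZE bias from the loop above,
	   so the ADDS below re-biases it by -__OPT_MID_BLOCK_SIZE; the same
	   biasing pattern is used for the word and byte stages that follow.  */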
.Lmid_block:
	adds	r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE
	blo	.Lcopy_word_by_word

	/* Kernel loop for mid-block copy */
	.align 2
.Lmid_block_loop:
	BEGIN_UNROLL_MID_BLOCK
#ifdef __ARM_ARCH_7EM__
	ldr	r3, [r1], #4
	str	r3, [r0], #4
	END_UNROLL
#else /* __ARM_ARCH_7M__ */
	ldr	r3, [r1, \offset]
	str	r3, [r0, \offset]
	END_UNROLL
	adds    r0, __OPT_MID_BLOCK_SIZE
	adds    r1, __OPT_MID_BLOCK_SIZE
#endif
	subs	r2, __OPT_MID_BLOCK_SIZE
	bhs	.Lmid_block_loop

.Lcopy_word_by_word:
	adds	r2, __OPT_MID_BLOCK_SIZE - 4
	blo	.Lcopy_less_than_4

	/* Kernel loop for small block copy */
	.align 2
.Lcopy_word_by_word_loop:
	ldr	r3, [r1], #4
	str	r3, [r0], #4
	subs	r2, #4
	bhs	.Lcopy_word_by_word_loop

.Lcopy_less_than_4:
	adds	r2, #4
	beq	.Ldone

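	/* r2 now holds the remaining 1..3 bytes.  LSLS #31 moves bit 0 into
	   the result (Z flag) and bit 1 into the carry: copy one byte if
	   bit 0 was set (NE), then one halfword if bit 1 was set (CS).  */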
	lsls	r2, r2, #31
	itt ne
	ldrbne  r3, [r1], #1
	strbne  r3, [r0], #1

	bcc	.Ldone
#ifdef __ARM_FEATURE_UNALIGNED
	ldrh	r3, [r1]
	strh	r3, [r0]
#else
	ldrb	r3, [r1]
	strb	r3, [r0]
	ldrb	r3, [r1, #1]
	strb	r3, [r0, #1]
#endif /* __ARM_FEATURE_UNALIGNED */

.Ldone:
	.cfi_remember_state
#ifdef __ARM_FEATURE_UNALIGNED
	mov	r0, ip
	epilogue push_ip=HAVE_PAC_LEAF
#else
	epilogue 0 push_ip=HAVE_PAC_LEAF
#endif /* __ARM_FEATURE_UNALIGNED */

	.align 2
.Lmisaligned_copy:
	.cfi_restore_state
#ifdef __ARM_FEATURE_UNALIGNED
	/* Define label Ldst_aligned as an alias for Lbig_block: once the
	   destination has been adjusted to an aligned address, jump straight
	   to the aligned copy.  */
#define Ldst_aligned Lbig_block

	/* Copy word by word using LDR: with unaligned access support the
	   hardware handles misaligned LDR and STR, so no software alignment
	   of src is needed.  */

	cmp	r2, #8
	blo	.Lbyte_copy

	/* if src is aligned, just go to the big block loop.  */
	lsls	r3, r1, #30
	beq	.Ldst_aligned
#else
	/* If len < 12, the misalignment adjustment costs more than simply
	   copying byte by byte.  Also, len must be >= 8 for the code below
	   to work correctly.  */
	cmp	r2, #12
	blo	.Lbyte_copy
#endif /* __ARM_FEATURE_UNALIGNED */

	/* Align dst only; do not try to align src.  That is because handling
	   an aligned src together with a misaligned dst costs more than the
	   other way round.  In the worst case, when src starts out aligned,
	   up to 4 additional bytes are copied one at a time, which is
	   acceptable.  */

	ands	r3, r0, #3
	beq	.Ldst_aligned

	rsb	r3, #4
	subs	r2, r3

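	/* r3 = 1..3 bytes needed to align dst; same Z/C flag trick as in
	   .Lcopy_less_than_4: copy one byte if bit 0 of r3 is set, then a
	   halfword if bit 1 is set.  */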
	lsls    r3, r3, #31
	itt ne
	ldrbne  r3, [r1], #1
	strbne  r3, [r0], #1

	bcc .Ldst_aligned

#ifdef __ARM_FEATURE_UNALIGNED
	ldrh    r3, [r1], #2
	strh    r3, [r0], #2
	b	.Ldst_aligned
#else
	ldrb    r3, [r1], #1
	strb    r3, [r0], #1
	ldrb    r3, [r1], #1
	strb    r3, [r0], #1
	/* Now that dst is aligned */
.Ldst_aligned:
	/* If r1 is aligned now as well, r0 and r1 had the same misalignment
	   and both are now aligned.  Go to the aligned copy.  */
	ands	r3, r1, #3
	beq	.Lbig_block

	/* dst is aligned, but src isn't.  Misaligned copy.  */

	push	{r4, r5}
	.cfi_adjust_cfa_offset 8
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	subs	r2, #4

	/* Move r1 back by its misalignment so that it is word-aligned.
	   Since r1 must be restored to the unaligned address after the loop,
	   keep the compensating offset in ip and subtract it from r1
	   afterward.  */
	subs	r1, r3
	rsb	ip, r3, #4

	/* Pre-load one word.  */
	ldr	r4, [r1], #4

	cmp	r3, #2
	beq	.Lmisaligned_copy_2_2
	cmp	r3, #3
	beq	.Lmisaligned_copy_3_1

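	/* mis_src_copy shift: src (r1) has been backed up to a word-aligned
	   address, with r4 pre-loaded.  Each iteration merges the useful tail
	   of the previous word (r4) with the head of the next word (r3) and
	   stores one aligned word to dst.  For little-endian this is roughly
	       out = (prev >> shift) | (next << (32 - shift));
	   where SHIFT is 8 * (src misalignment in bytes).  */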
	.macro mis_src_copy shift
1:
#ifdef __ARM_BIG_ENDIAN
	lsls	r4, r4, \shift
#else
	lsrs	r4, r4, \shift
#endif
	ldr	r3, [r1], #4
#ifdef __ARM_BIG_ENDIAN
	lsrs	r5, r3, 32-\shift
#else
	lsls	r5, r3, 32-\shift
#endif
	orr	r4, r4, r5
	str	r4, [r0], #4
	mov	r4, r3
	subs	r2, #4
	bhs	1b
	.endm

.Lmisaligned_copy_1_3:
	mis_src_copy shift=8
	b	.Lsrc_misaligned_tail

.Lmisaligned_copy_3_1:
	mis_src_copy shift=24
	b	.Lsrc_misaligned_tail

.Lmisaligned_copy_2_2:
	/* For 2_2 misalignment, ldr is still faster than 2 x ldrh.  */
	mis_src_copy shift=16

.Lsrc_misaligned_tail:
	adds	r2, #4
	subs	r1, ip
	pop	{r4, r5}
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -8

#endif /* __ARM_FEATURE_UNALIGNED */

.Lbyte_copy:
	subs	r2, #4
	blo	.Lcopy_less_than_4

.Lbyte_copy_loop:
	subs    r2, #1
	ldrb    r3, [r1], #1
	strb    r3, [r0], #1
	bhs	.Lbyte_copy_loop

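	/* The loop above exits after copying len-3 bytes; copy the last 3
	   bytes unconditionally (len >= 4 is guaranteed on this path).  */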
	ldrb	r3, [r1]
	strb	r3, [r0]
	ldrb	r3, [r1, #1]
	strb	r3, [r0, #1]
	ldrb	r3, [r1, #2]
	strb	r3, [r0, #2]

#ifdef __ARM_FEATURE_UNALIGNED
	mov	r0, ip
	epilogue push_ip=HAVE_PAC_LEAF
#else
	epilogue 0 push_ip=HAVE_PAC_LEAF
#endif /* __ARM_FEATURE_UNALIGNED */
	.cfi_endproc
	.cantunwind
	.fnend
	.size	memcpy, .-memcpy
