/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* This memcpy routine is optimised for Cortex-M3/M4 cores with or without
   unaligned access support.

   If compiled with GCC, this file should be enclosed within the following
   pre-processing check:
   #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)

   Prototype: void *memcpy (void *dst, const void *src, size_t count);

   The job is done in 5 steps:
   Step 1: Align the src/dst pointers; fall back to the misaligned copy
	   path if both cannot be aligned.
   Step 2: Repeatedly copy big blocks of __OPT_BIG_BLOCK_SIZE bytes.
   Step 3: Repeatedly copy mid blocks of __OPT_MID_BLOCK_SIZE bytes.
   Step 4: Copy word by word.
   Step 5: Copy byte by byte.

   Tunable options:
     __OPT_BIG_BLOCK_SIZE: Size of the big block in bytes.  Defaults to 64.
     __OPT_MID_BLOCK_SIZE: Size of the mid block in bytes.  Defaults to 16.
 */
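
/* For reference, a rough C sketch of the aligned copy path (steps 2-5) with
   the default block sizes.  This is illustrative only: the names below are
   not part of this file, and the real code keeps the byte count pre-biased
   to avoid extra compares inside the loops.

     #include <stddef.h>
     #include <stdint.h>

     static void copy_aligned (uint32_t *d, const uint32_t *s, size_t n)
     {
       while (n >= 64) {                        // Step 2: big blocks
         for (int i = 0; i < 16; i++)
           *d++ = *s++;
         n -= 64;
       }
       while (n >= 16) {                        // Step 3: mid blocks
         for (int i = 0; i < 4; i++)
           *d++ = *s++;
         n -= 16;
       }
       while (n >= 4) {                         // Step 4: single words
         *d++ = *s++;
         n -= 4;
       }
       unsigned char *db = (unsigned char *) d;
       const unsigned char *sb = (const unsigned char *) s;
       while (n--)                              // Step 5: trailing bytes
         *db++ = *sb++;
     }
 */
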
#include <picolibc.h>

#include "arm_asm.h"

#ifndef __OPT_BIG_BLOCK_SIZE
#define __OPT_BIG_BLOCK_SIZE (4 * 16)
#endif

#ifndef __OPT_MID_BLOCK_SIZE
#define __OPT_MID_BLOCK_SIZE (4 * 4)
#endif

#if __OPT_BIG_BLOCK_SIZE == 16
#define BEGIN_UNROLL_BIG_BLOCK \
  .irp offset, 0,4,8,12
#elif __OPT_BIG_BLOCK_SIZE == 32
#define BEGIN_UNROLL_BIG_BLOCK \
  .irp offset, 0,4,8,12,16,20,24,28
#elif __OPT_BIG_BLOCK_SIZE == 64
#define BEGIN_UNROLL_BIG_BLOCK \
  .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
#else
#error "Illegal __OPT_BIG_BLOCK_SIZE"
#endif

#if __OPT_MID_BLOCK_SIZE == 8
#define BEGIN_UNROLL_MID_BLOCK \
  .irp offset, 0,4
#elif __OPT_MID_BLOCK_SIZE == 16
#define BEGIN_UNROLL_MID_BLOCK \
  .irp offset, 0,4,8,12
#else
#error "Illegal __OPT_MID_BLOCK_SIZE"
#endif

#define END_UNROLL .endr
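
/* The BEGIN_UNROLL_*_BLOCK/END_UNROLL pairs wrap the copy body in a .irp
   loop, so the assembler emits one copy of the body per listed offset;
   e.g. with __OPT_MID_BLOCK_SIZE == 16 the mid-block body is emitted four
   times, for offsets 0, 4, 8 and 12.  */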

	.syntax unified
	.text
	.align	2
	.global	memcpy
	.thumb
	.thumb_func
	.fnstart
	.cfi_sections .debug_frame
	.cfi_startproc
	.type	memcpy, %function
	ASM_ALIAS __aeabi_memcpy, memcpy
	ASM_ALIAS __aeabi_memcpy4, memcpy
	ASM_ALIAS __aeabi_memcpy8, memcpy
memcpy:
	// r0: dst
	// r1: src
	// r2: len
#ifdef __ARM_FEATURE_UNALIGNED
	/* When unaligned access is supported, ip is not otherwise used in
	   the function body, so use it to preserve the return value (dst).  */
	prologue push_ip=HAVE_PAC_LEAF
	mov	ip, r0
#else
	prologue 0 push_ip=HAVE_PAC_LEAF
#endif /* __ARM_FEATURE_UNALIGNED */
	/* Take the misaligned path unless both src and dst are word
	   aligned.  */
	orr	r3, r1, r0
	ands	r3, r3, #3
	bne	.Lmisaligned_copy

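	/* The remaining byte count in r2 is kept pre-biased throughout the
	   aligned path: each stage subtracts its block size up front, loops
	   while no borrow occurs (bhs), and the next stage adds back the
	   difference between the two block sizes.  This avoids a separate
	   compare inside every loop.  */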
.Lbig_block:
	subs	r2, __OPT_BIG_BLOCK_SIZE
	blo	.Lmid_block

	/* Kernel loop for big block copy */
	.align 2
.Lbig_block_loop:
	BEGIN_UNROLL_BIG_BLOCK
#ifdef __ARM_ARCH_7EM__
	ldr	r3, [r1], #4
	str	r3, [r0], #4
	END_UNROLL
#else /* __ARM_ARCH_7M__ */
	ldr	r3, [r1, \offset]
	str	r3, [r0, \offset]
	END_UNROLL
	adds	r0, __OPT_BIG_BLOCK_SIZE
	adds	r1, __OPT_BIG_BLOCK_SIZE
#endif
	subs	r2, __OPT_BIG_BLOCK_SIZE
	bhs	.Lbig_block_loop

.Lmid_block:
	adds	r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE
	blo	.Lcopy_word_by_word

	/* Kernel loop for mid-block copy */
	.align 2
.Lmid_block_loop:
	BEGIN_UNROLL_MID_BLOCK
#ifdef __ARM_ARCH_7EM__
	ldr	r3, [r1], #4
	str	r3, [r0], #4
	END_UNROLL
#else /* __ARM_ARCH_7M__ */
	ldr	r3, [r1, \offset]
	str	r3, [r0, \offset]
	END_UNROLL
	adds	r0, __OPT_MID_BLOCK_SIZE
	adds	r1, __OPT_MID_BLOCK_SIZE
#endif
	subs	r2, __OPT_MID_BLOCK_SIZE
	bhs	.Lmid_block_loop

.Lcopy_word_by_word:
	adds	r2, __OPT_MID_BLOCK_SIZE - 4
	blo	.Lcopy_less_than_4

	/* Kernel loop for small block copy */
	.align 2
.Lcopy_word_by_word_loop:
	ldr	r3, [r1], #4
	str	r3, [r0], #4
	subs	r2, #4
	bhs	.Lcopy_word_by_word_loop

.Lcopy_less_than_4:
	adds	r2, #4
	beq	.Ldone

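	/* At most 3 bytes remain.  lsls r2, r2, #31 moves bit 0 of the count
	   into the N flag and bit 1 into the C flag: copy one byte if N is
	   set, then two more (a halfword when unaligned access is available)
	   if C is set.  */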
	lsls	r2, r2, #31
	itt ne
	ldrbne  r3, [r1], #1
	strbne  r3, [r0], #1

	bcc	.Ldone
#ifdef __ARM_FEATURE_UNALIGNED
	ldrh	r3, [r1]
	strh	r3, [r0]
#else
	ldrb	r3, [r1]
	strb	r3, [r0]
	ldrb	r3, [r1, #1]
	strb	r3, [r0, #1]
#endif /* __ARM_FEATURE_UNALIGNED */

.Ldone:
	.cfi_remember_state
#ifdef __ARM_FEATURE_UNALIGNED
	mov	r0, ip
	epilogue push_ip=HAVE_PAC_LEAF
#else
	epilogue 0 push_ip=HAVE_PAC_LEAF
#endif /* __ARM_FEATURE_UNALIGNED */

	.align 2
.Lmisaligned_copy:
	.cfi_restore_state
#ifdef __ARM_FEATURE_UNALIGNED
	/* Alias the label Ldst_aligned to Lbig_block: once the destination
	   has been adjusted to a word boundary, go straight to the aligned
	   copy.  */
#define Ldst_aligned Lbig_block

	/* Copy word by word using LDR/STR when the core handles unaligned
	   accesses in hardware, i.e. alignment trapping is disabled, so
	   unaligned LDR and STR are safe.  */

	cmp	r2, #8
	blo	.Lbyte_copy

	/* If src is aligned, just go to the big block loop; unaligned word
	   stores take care of any dst misalignment.  */
	lsls	r3, r1, #30
	beq	.Ldst_aligned
#else
	/* If len < 12, the misalignment adjustment has more overhead than a
	   plain byte-by-byte copy.  len must also be >= 8 for the code that
	   follows to work correctly.  */
	cmp	r2, #12
	blo	.Lbyte_copy
#endif /* __ARM_FEATURE_UNALIGNED */

	/* Align dst only; do not try to align src as well, because handling
	   an aligned src with a misaligned dst needs more overhead than the
	   other way round.  The worst case is when src starts out aligned:
	   up to 4 additional bytes are copied through the misaligned path,
	   which is acceptable.  */

	ands	r3, r0, #3
	beq	.Ldst_aligned

	rsb	r3, #4
	subs	r2, r3

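	/* r3 now holds 4 - (dst & 3), the number of bytes needed to align
	   dst.  The same flag trick as in the tail copy is used: copy one
	   byte if bit 0 of r3 is set, then two more if bit 1 is set.  */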
	lsls    r3, r3, #31
	itt ne
	ldrbne  r3, [r1], #1
	strbne  r3, [r0], #1

	bcc	.Ldst_aligned

#ifdef __ARM_FEATURE_UNALIGNED
	ldrh    r3, [r1], #2
	strh    r3, [r0], #2
	b	.Ldst_aligned
#else
	ldrb    r3, [r1], #1
	strb    r3, [r0], #1
	ldrb    r3, [r1], #1
	strb    r3, [r0], #1
	/* Now dst is aligned.  */
.Ldst_aligned:
	/* If r1 is also aligned now, r0 and r1 had the same misalignment and
	   both are aligned; go to the aligned copy.  */
	ands	r3, r1, #3
	beq	.Lbig_block

	/* dst is aligned, but src isn't.  Misaligned copy.  */

	push	{r4, r5}
	.cfi_adjust_cfa_offset 8
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	subs	r2, #4

	/* Move r1 back by its misalignment (r3 bytes) so that it is word
	   aligned.  Since r1 must be restored to the unaligned address after
	   the loop, keep the difference (4 - r3) in ip and subtract it from
	   r1 afterwards; the extra 4 accounts for the one-word pre-load
	   below.  */
	subs	r1, r3
	rsb	ip, r3, #4

	/* Pre-load one word.  */
	ldr	r4, [r1], #4

	cmp	r3, #2
	beq	.Lmisaligned_copy_2_2
	cmp	r3, #3
	beq	.Lmisaligned_copy_3_1

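	/* mis_src_copy copies using only aligned loads from src: it keeps the
	   previously loaded word, loads the next aligned word, and merges the
	   two with shifts to form each word stored to dst.  In the label
	   suffixes below, <m>_<n> means src is m bytes past a word boundary
	   and n bytes short of the next one; the shift amount is 8 * m (the
	   shift direction depends on endianness).  */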
	.macro mis_src_copy shift
1:
#ifdef __ARM_BIG_ENDIAN
	lsls	r4, r4, \shift
#else
	lsrs	r4, r4, \shift
#endif
	ldr	r3, [r1], #4
#ifdef __ARM_BIG_ENDIAN
	lsrs	r5, r3, 32-\shift
#else
	lsls	r5, r3, 32-\shift
#endif
	orr	r4, r4, r5
	str	r4, [r0], #4
	mov	r4, r3
	subs	r2, #4
	bhs	1b
	.endm
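
	/* For reference, a rough C equivalent of the loop emitted by
	   mis_src_copy for the little-endian case with shift == 8 (src one
	   byte past a word boundary).  Illustrative only: the names are not
	   part of this file and the word count is assumed to be computed by
	   the caller.

	     #include <stddef.h>
	     #include <stdint.h>

	     static void copy_src_off1 (uint32_t *dst, const uint32_t *src,
					size_t words, uint32_t preloaded)
	     {
	       uint32_t cur = preloaded;               // r4, loaded before the loop
	       while (words--) {
	         uint32_t next = *src++;               // r3, next aligned word
	         *dst++ = (cur >> 8) | (next << 24);   // merge the two words
	         cur = next;
	       }
	     }
	 */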

.Lmisaligned_copy_1_3:
	mis_src_copy shift=8
	b	.Lsrc_misaligned_tail

.Lmisaligned_copy_3_1:
	mis_src_copy shift=24
	b	.Lsrc_misaligned_tail

.Lmisaligned_copy_2_2:
	/* For 2_2 misalignment, ldr is still faster than 2 x ldrh.  */
	mis_src_copy shift=16

.Lsrc_misaligned_tail:
	adds	r2, #4
	subs	r1, ip
	pop	{r4, r5}
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -8

#endif /* __ARM_FEATURE_UNALIGNED */

.Lbyte_copy:
	subs	r2, #4
	blo	.Lcopy_less_than_4

.Lbyte_copy_loop:
	subs    r2, #1
	ldrb    r3, [r1], #1
	strb    r3, [r0], #1
	bhs	.Lbyte_copy_loop

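	/* On entry to the loop above r2 held len - 4, so the loop stops after
	   copying len - 3 bytes; the last three bytes are copied
	   unconditionally here.  */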
	ldrb	r3, [r1]
	strb	r3, [r0]
	ldrb	r3, [r1, #1]
	strb	r3, [r0, #1]
	ldrb	r3, [r1, #2]
	strb	r3, [r0, #2]

#ifdef __ARM_FEATURE_UNALIGNED
	mov	r0, ip
	epilogue push_ip=HAVE_PAC_LEAF
#else
	epilogue 0 push_ip=HAVE_PAC_LEAF
#endif /* __ARM_FEATURE_UNALIGNED */
	.cfi_endproc
	.cantunwind
	.fnend
	.size	memcpy, .-memcpy