/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-A if using NEON)
    ARM state
    Unaligned accesses are permitted
    LDRD/STRD support unaligned word accesses

   If compiled with GCC, this file should be enclosed within the following
   pre-processing check:
   #if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED)

 */
#include <picolibc.h>

#include "arm_asm.h"

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif __ARM_FP != 0

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4
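
/* These constants feed the computed branches below.  In ARM state a
   read of PC returns the address of the current instruction plus 8
   (PC_OFFSET), so a dispatch of the form

	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

   (with tmp1 initially holding count & 0x38) jumps into an unrolled
   copy sequence, skipping one load/store pair for every 8 bytes that
   are not needed.  For example, with 48 bytes remaining, tmp1 becomes
   56 - 8 + 4 - 48 = 4 and execution resumes at the "12 words to go"
   entry.  */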

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
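	/* Copy one 64-byte line.  Each VSTR drains a register filled on
	   the previous call while the adjacent VLDR refills it from the
	   current line; the second load of \vreg reaches prefetch_lines
	   lines ahead, acting as a prefetch and priming \vreg for its
	   next turn in the d3-d7 rotation.  */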
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

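	/* As cpy_line_vfp, but without the far-ahead reload of \vreg:
	   this is used for the final prefetch_lines lines, where reading
	   prefetch_lines * 64 bytes further on could overrun the source
	   buffer.  */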
	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm
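/* For example, "def_fn memcpy p2align=6" below expands to:

	.text
	.p2align 6
	.global memcpy
	.type memcpy, %function
   memcpy:
 */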

def_fn memcpy p2align=6
	ASM_ALIAS __aeabi_memcpy, memcpy
	ASM_ALIAS __aeabi_memcpy4, memcpy
	ASM_ALIAS __aeabi_memcpy8, memcpy

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bge	.Lcpy_not_short
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add	pc, pc, tmp1, lsl #1
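	/* tmp1 is scaled by two because each word copied needs an 8-byte
	   LDR/STR pair while the offsets step by only 4; PC_OFFSET and
	   INSN_SIZE are pre-halved above to compensate for the shift.
	   SRC and DST were already advanced past the copied region, so
	   the unrolled sequence works back from negative offsets.  */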

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

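	/* Shift the low two bits of count into the flags: C picks up
	   bit 1 (one halfword left) and Z reflects bit 0 (one trailing
	   byte), so the conditional copies below finish the job without
	   another compare.  */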
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]		/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	.Lcpy_notaligned

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 32-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring DST into full 64-bit alignment.  */
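	/* LSLS moves DST's low three bits to the top of tmp2; after the
	   RSBS, tmp2 holds the distance to 64-bit alignment scaled by
	   2^29.  MI (bit 2 of that distance) selects a word copy, then
	   LSLS #2 exposes the halfword bit in C and the byte bit via Z
	   for the conditional LDRH/LDRB copies.  */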
	lsls	tmp2, dst, #29
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blt	.Ltail63aligned

	cmp	tmp2, #512
	bge	.Lcpy_body_long

.Lcpy_body_medium:			/* Count in tmp2.  */
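	/* Medium copies: a straight 64-bytes-per-iteration loop that
	   alternates d0 and d1, so each VSTR drains one register while
	   the other's VLDR is still completing.  */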
#ifdef USE_VFP
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bge	1b
	tst	tmp2, #0x3f
	beq	.Ldone

.Ltail63aligned:			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bge	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
1:
	add	src, src, #8
	add	dst, dst, #8

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 32-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead.  */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

.Ldone:
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr

.Lcpy_body_long:			/* Count in tmp2.  */

	/* Long copy.  We know that there are at least
	   (prefetch_lines * 64) bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */
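	/* d3-d7 are primed with the leading doubleword of each of the
	   next five 64-byte lines (offsets 0, 64, 128, 192, 256), and
	   d0-d2 with the rest of the first half-line; cpy_line_vfp keeps
	   the pattern going by refilling from prefetch_lines * 64 - 32
	   bytes ahead of the store point.  */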

	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blt	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bge	1b

2:
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	.Lcpy_body_medium
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
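	/* The loop is rotated (software pipelined): entry is into the
	   middle at 1f, and each STRD writes data loaded the iteration
	   before.  The STRDs to [sp, #8..#24] spill the callee-saved
	   registers r4-r9 (B/C/D) into the frame reserved at
	   .Lcpy_not_short, to be restored once the loop drains.  */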
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	.Ltail63aligned
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

.Lcpy_notaligned:
	pld	[src]
	pld	[src, #64]
	/* There are at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrmi	tmp2, [sp], #FRAME_SIZE
	bmi	.Ltail63unaligned
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bmi	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bpl	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
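	/* SRC may not even be word aligned here, so the loads use single
	   LDRs, which the unaligned-access assumption above permits,
	   rather than LDRD; DST was brought to 64-bit alignment, so STRD
	   remains safe for the stores.  */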
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	.Ltail63unaligned
	bx	lr

	.size	memcpy, . - memcpy