/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses
    LDRD/STRD support unaligned word accesses
   If compiled with GCC, this file should be enclosed within the
   following pre-processing check:
   #if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED)

 */
	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE    32

#endif
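
/* Note: FRAME_SIZE is 32 when the LDRD/STRD paths below spill the
   callee-saved pairs B, C and D to the frame at [sp, #8..#24]; the
   NEON build only ever saves tmp2, so a 4-byte frame suffices.  */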

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif
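
/* For example, ALIGN (dst, 64) expands to the operand dst:64 (or
   dst,:64 with old GAS), telling the assembler that the address in
   dst is 64-bit aligned for the NEON element load/stores below.  */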

#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4
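
/* In ARM state, reading the PC in the computed branches below (ADD PC,
   PC, tmp1) yields the address of the ADD plus PC_OFFSET (8).  So in
   the RSB constant #(56 - PC_OFFSET + INSN_SIZE), a remaining count of
   zero lands just past a 56-byte dispatch ladder, and each remaining
   doubleword steps one 8-byte load/store pair back into it.  */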

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this, the
   code below will need adjustment to compensate.  */

#define prefetch_lines	5
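
/* Five lines of 64 bytes puts the read stream about 320 bytes ahead of
   the stores: see the vldr d7, [src, #256] preload below and the
   #\base + prefetch_lines * 64 - 32 refill inside cpy_line_vfp.  */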

#ifdef USE_VFP
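	/* Copy one 64-byte line, with \vreg holding data already read one
	   prefetch distance ahead.  Each vstr drains a register that the
	   following vldr refills, so one load and one store per register
	   are kept in flight; the far load of \vreg maintains the
	   read-ahead.  */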
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

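	/* As cpy_line_vfp, but without the far read-ahead load of \vreg:
	   used once the remaining count no longer covers the prefetch
	   distance.  */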
	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm
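
/* def_fn memcpy p2align=6 below places the entry point on a 64-byte
   boundary, matching the Cortex-A15 cache-line size.  */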

def_fn memcpy p2align=6

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bge	.Lcpy_not_short
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
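	/* Each vld1/vst1 pair below is 8 bytes of code copying 8 bytes of
	   data, so the computed branch above skips one pair for every
	   missing doubleword of tail.  */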
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add	pc, pc, tmp1, lsl #1
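	/* Each word of tail here takes two instructions (8 bytes of code
	   per 4 bytes of data), but RSB can subtract tmp1 only once, so
	   the offset is formed at half scale and doubled by the LSL #1.  */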

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

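	/* LSLS #31 moves bit 1 of count into C and bit 0 into N/Z, so the
	   CS (halfword) and NE (byte) copies below mop up the final 0-3
	   bytes without further compares.  */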
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]		/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	.Lcpy_notaligned

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring DST into full 64-bit alignment.  */
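	/* LSLS #29 exposes the low three bits of dst in the flags: EQ
	   means dst is already 8-byte aligned.  After the RSBS, MI selects
	   a word copy; the second LSLS then routes bit 1 into C (halfword)
	   and bit 0 into N/Z (byte).  */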
	lsls	tmp2, dst, #29
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blt	.Ltail63aligned

	cmp	tmp2, #512
	bge	.Lcpy_body_long

.Lcpy_body_medium:			/* Count in tmp2.  */
#ifdef USE_VFP
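	/* Medium-size aligned copy: 64 bytes per iteration bounced through
	   two d-registers, with each load issued one slot ahead of the
	   store that reuses its register.  */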
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bge	1b
	tst	tmp2, #0x3f
	beq	.Ldone

.Ltail63aligned:			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
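	/* Pre-bias src and dst by 8 so the last LDRD/STRD of each 64-byte
	   iteration can use the [reg, #64]! writeback form to advance the
	   pointers.  */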
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bge	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
1:
	add	src, src, #8
	add	dst, dst, #8

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 32-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead.  */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

.Ldone:
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr

.Lcpy_body_long:			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */
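	/* d3-d7 each hold one doubleword from the next five 64-byte lines;
	   cpy_line_vfp refreshes the register for the line just copied
	   with data read prefetch_lines lines further on.  */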

	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blt	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bge	1b

2:
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	.Lcpy_body_medium
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
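	/* In this software-pipelined loop, the loads for one iteration
	   issue alongside the stores of the previous one.  B, C and D are
	   callee-saved, hence the spills to [sp, #8..#24] before the loop
	   and the reloads afterwards.  */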
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	.Ltail63aligned
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

.Lcpy_notaligned:
	pld	[src]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrmi	tmp2, [sp], #FRAME_SIZE
	bmi	.Ltail63unaligned
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
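	/* dst is now 64-bit aligned, so the stores carry the :64 alignment
	   hint via ALIGN; src may still be arbitrarily aligned, so the
	   loads use plain unaligned vld1 forms.  */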
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bmi	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bpl	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
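	/* Loads here use single LDRs, since src may be arbitrarily aligned
	   and only plain word loads are assumed to tolerate that; stores
	   pair up into STRDs because dst was brought to 64-bit alignment
	   above.  */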
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	.Ltail63unaligned
	bx	lr

	.size	memcpy, . - memcpy