/*
 * Copyright (c) 2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "arm_asm.h"

/* NOTE: This ifdef MUST match the one in aeabi_memcpy.c.  */
#if !(defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) && \
	defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) && \
	(defined (__ARM_NEON__) || !defined (__SOFTFP__))
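/* This implementation relies on __ARM_FEATURE_UNALIGNED: the loop at
   src_not_word_aligned issues LDR on addresses that are not word-aligned.  */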

	.syntax unified
	.global __aeabi_memcpy
	.type   __aeabi_memcpy, %function
__aeabi_memcpy:
	/* Assumes that n >= 0, and dst, src are valid pointers.
	   If there are at least 8 bytes to copy, use LDRD/STRD.
	   If src and dst are misaligned with different offsets,
	   first copy byte by byte until dst is aligned,
	   and then copy using LDRD/STRD and shift if needed.
	   When fewer than 8 bytes are left, copy a word and then
	   byte by byte.  */

       /* Save registers (r0 holds the return value):
          optimized push {r0, r4, r5, lr}.
          To try to improve performance, the stack layout is changed,
          i.e., the stack does not look the way users expect
          (highest numbered register at highest address).  */
        push {r0, lr}
        strd r4, r5, [sp, #-8]!
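        /* Stack layout now: [sp] r4, [sp, #4] r5,
           [sp, #8] r0 (saved dst, the return value), [sp, #12] lr.  */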

        /* Get copying of tiny blocks out of the way first.  */
        /* Are there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4       /* If n < 4.  */

        /* Check word alignment.  */
        ands    ip, r0, #3             /* ip = last 2 bits of dst.  */
        bne     dst_not_word_aligned   /* If dst is not word-aligned.  */

        /* Get here if dst is word-aligned.  */
        ands    ip, r1, #3             /* ip = last 2 bits of src.  */
        bne     src_not_word_aligned   /* If src is not word-aligned.  */
word_aligned:
        /* Get here if source and dst both are word-aligned.
           The number of bytes remaining to copy is r2 + 4.  */

        /* Are there at least 64 bytes to copy?  */
        subs    r2, r2, #60
        blt     copy_less_than_64      /* If r2 + 4 < 64.  */

        /* First, align the destination buffer to 8 bytes,
           to make sure double loads and stores don't cross a cache line
           boundary, as they are then more expensive even if the data is
           in the cache (they require two load/store issue cycles instead
           of one).
           If only one of the buffers is not 8-byte aligned,
           then it's more important to align dst than src,
           because stores that cross a cache line boundary are penalized
           more than loads.
           This check and realignment are only worth doing
           if there is a lot to copy.  */

        /* Get here if dst is word-aligned,
           i.e., the 2 least significant bits are 0.
           If dst is not two-word (8-byte) aligned, i.e., bit 2 of dst
           is set, then copy 1 word (4 bytes) to align it.  */
        ands    r3, r0, #4
        beq     two_word_aligned  /* If dst already two-word aligned.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, r2, #4
        blt     copy_less_than_64
two_word_aligned:
        /* TODO: Align to cacheline (useful for PLD optimization).  */

        /* Every loop iteration copies 64 bytes.  */
1:
        .irp    offset, #0, #8, #16, #24, #32, #40, #48, #56
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
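        /* The .irp above expands to eight LDRD/STRD pairs with
           immediate offsets 0, 8, ..., 56, copying 64 bytes per
           iteration.  */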

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     1b                     /* If there is more to copy.  */

copy_less_than_64:

        /* Get here if fewer than 64 bytes remain to copy, -64 <= r2 < 0.
           Restore the count if there are more than 7 bytes to copy.  */
        adds    r2, r2, #56
        blt     copy_less_than_8
        /* Copy 8 bytes at a time.  */
2:
        ldrd    r4, r5, [r1], #8
        strd    r4, r5, [r0], #8
        subs    r2, r2, #8
        bge     2b                     /* If there is more to copy.  */

copy_less_than_8:

        /* Get here if fewer than 8 bytes remain to copy, -8 <= r2 < 0.
           Check if there is more to copy.  */
        cmn     r2, #8
        beq     return                 /* If r2 + 8 == 0.  */

        /* Restore the count if there are more than 3 bytes to copy.  */
        adds    r2, r2, #4
        blt     copy_less_than_4

        /* Copy 4 bytes.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4

copy_less_than_4:
        /* Get here if fewer than 4 bytes remain to copy, -4 <= r2 < 0.  */

        /* Restore the count, check if there is more to copy.  */
        adds    r2, r2, #4
        beq     return                 /* If r2 == 0.  */

        /* Get here with r2 in {1,2,3} = {01,10,11} (binary).  */
        /* Logical shift left r2, insert 0s, update flags.  */
        lsls    r2, r2, #31
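        /* After LSL #31, Z is set iff bit 0 of the old r2 was 0
           (the shifted result is zero), and C holds bit 1 of the
           old r2.  */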

        /* Copy byte by byte.
           Condition ne means the last bit of r2 is set,
           i.e., r2 is 1 or 3.
           Condition cs means the second to last bit of r2 is set,
           i.e., r2 is 2 or 3.  */
        itt     ne
        ldrbne  r3, [r1], #1
        strbne  r3, [r0], #1

        itttt   cs
        ldrbcs  r4, [r1], #1
        ldrbcs  r5, [r1]
        strbcs  r4, [r0], #1
        strbcs  r5, [r0]

return:
        /* Restore registers: optimized pop {r0, r4, r5, pc}.  */
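        /* Popping the r0 saved at entry makes memcpy return the
           original dst pointer.  */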
        ldrd r4, r5, [sp], #8
        pop {r0, pc}         /* This is the only return point of memcpy.  */

dst_not_word_aligned:

       /* Get here when dst is not aligned and ip has the last 2 bits of dst,
          i.e., ip is the offset of dst from a word boundary.
          The number of bytes that remains to copy is r2 + 4,
          i.e., there are at least 4 bytes to copy.
          Write a partial word (1 to 3 bytes), such that dst becomes
          word-aligned.  */

       /* If dst is at ip bytes offset from a word (with 0 < ip < 4),
          then there are (4 - ip) bytes to fill up to align dst to the next
          word.  */
        rsb     ip, ip, #4                 /* ip = 4 - ip.  */
        cmp     ip, #2
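       /* ip is 1, 2, or 3.  The gt pair (ip == 3) copies one byte,
          the ge pair (ip >= 2) another, and the unconditional pair
          below always copies one, for ip bytes in total.  */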

       /* Copy byte by byte with conditionals.  */
        itt     gt
        ldrbgt  r3, [r1], #1
        strbgt  r3, [r0], #1

        itt     ge
        ldrbge  r4, [r1], #1
        strbge  r4, [r0], #1

        ldrb    lr, [r1], #1
        strb    lr, [r0], #1

       /* Update the count.
          ip holds the number of bytes we have just copied.  */
        subs    r2, r2, ip                        /* r2 = r2 - ip.  */
        blt     copy_less_than_4                  /* If r2 < ip.  */

       /* Get here if there are at least 4 bytes to copy.
          Check if src is aligned.  If beforehand src and dst were not word
          aligned but congruent (had the same offset), then now they are both
          word-aligned, and we can copy the rest efficiently (without
          shifting).  */
        ands    ip, r1, #3                    /* ip = last 2 bits of src.  */
        beq     word_aligned                  /* If r1 is word-aligned.  */

src_not_word_aligned:
       /* Get here when src is not word-aligned, but dst is word-aligned.
          The number of bytes that remains to copy is r2 + 4.  */

       /* Copy word by word using LDR when alignment can be done in hardware,
          i.e., SCTLR.A is clear, so LDR and STR support unaligned access.  */
        subs    r2, r2, #60
        blt     8f

7:
        /* Copy 64 bytes in every loop iteration.  */
        .irp    offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
        ldr     r3, [r1, \offset]
        str     r3, [r0, \offset]
        .endr
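        /* Only the LDRs are unaligned here; dst was word-aligned on
           entry, so the STRs stay aligned.  */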

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     7b

8:
        /* Get here if fewer than 64 bytes remain to copy, -64 <= r2 < 0.
           Check if there are more than 3 bytes to copy.  */
        adds    r2, r2, #60
        blt     copy_less_than_4

9:
       /* Get here if there are fewer than 64 but at least 4 bytes to copy,
          where the number of bytes to copy is r2 + 4.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, r2, #4
        bge     9b

        b       copy_less_than_4


	.syntax unified
	.global __aeabi_memcpy4
	.type   __aeabi_memcpy4, %function
__aeabi_memcpy4:
	/* Assumes that both of its arguments are 4-byte aligned.  */

        push {r0, lr}
        strd r4, r5, [sp, #-8]!

        /* Are there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4       /* If n < 4.  */

	bl	word_aligned
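	/* word_aligned does not return here: it exits through the pop
	   at `return', so clobbering lr with BL is harmless.  */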

	.syntax unified
	.global __aeabi_memcpy8
	.type   __aeabi_memcpy8, %function
__aeabi_memcpy8:
	/* Assumes that both of its arguments are 8-byte aligned.  */

        push {r0, lr}
        strd r4, r5, [sp, #-8]!

	/* Are there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4	/* If n < 4.  */

        /* Are there at least 8 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_8	/* If n < 8.  */
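        /* Two separate SUBS keep r2 biased the way each fall-back
           label expects: copy_less_than_4 wants r2 = n - 4 and
           copy_less_than_8 wants r2 = n - 8.  */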

	/* Are there at least 64 bytes to copy?  */
	subs	r2, r2, #56
	blt	copy_less_than_64	/* If n < 64.  */

	bl	two_word_aligned

#endif