/*
 * Copyright (c) 2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <picolibc.h>

#include "arm_asm.h"

	.syntax unified
	.global memcpy
	.type   memcpy, %function
	ASM_ALIAS __aeabi_memcpy, memcpy
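	/* Note: __aeabi_memcpy is the AEABI run-time helper; it takes the
	   same arguments in the same order as memcpy and has no
	   return-value requirement, so aliasing it to memcpy is valid.  */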
memcpy:
	/* Assumes that n >= 0 and that dst and src are valid pointers.
	   If there are at least 8 bytes to copy, use LDRD/STRD.
	   If src and dst are misaligned with different offsets,
	   first copy byte by byte until dst is aligned,
	   and then copy using LDRD/STRD, shifting if needed.
	   When fewer than 8 bytes remain, copy a word and then byte by byte.  */
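
	/* Register use throughout: r0 = dst and r1 = src (both advance as
	   the copy proceeds), r2 = byte count (biased downward during the
	   copy loops), r3/r4/r5/lr = scratch, ip = alignment offset.  */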

       /* Save registers (r0 holds the return value):
          optimized push {r0, r4, r5, lr}.
          To try to improve performance, the stack layout is changed,
          i.e., we do not keep the stack looking the way users expect
          (highest-numbered register at highest address).  */
        push {r0, lr}
        strd r4, r5, [sp, #-8]!
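
        /* The stack now holds, from sp upward: r4, r5, the original r0,
           then lr; the epilogue at 'return' unwinds in the same order.  */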

        /* Get copying of tiny blocks out of the way first.  */
        /* Are there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4       /* If n < 4.  */

        /* Check word alignment.  */
        ands    ip, r0, #3             /* ip = last 2 bits of dst.  */
        bne     dst_not_word_aligned   /* If dst is not word-aligned.  */

        /* Get here if dst is word-aligned.  */
        ands    ip, r1, #3             /* ip = last 2 bits of src.  */
        bne     src_not_word_aligned   /* If src is not word-aligned.  */
word_aligned:
        /* Get here if both src and dst are word-aligned.
           The number of bytes remaining to copy is r2+4.  */

        /* Are there at least 64 bytes to copy?  */
        subs    r2, r2, #60
        blt     copy_less_than_64                /* If r2 + 4 < 64.  */

        /* First, align the destination buffer to 8 bytes,
           to make sure double loads and stores don't cross a cache line
           boundary, as they are then more expensive even if the data is
           in the cache (they require two load/store issue cycles instead
           of one).
           If only one of the buffers is not 8-byte aligned,
           then it's more important to align dst than src,
           because stores that cross a cache-line boundary
           are penalized more heavily than loads.
           This check and realignment are only worth doing
           if there is a lot to copy.  */

        /* Get here if dst is word-aligned,
           i.e., the 2 least significant bits are 0.
           If dst is not two-word (8-byte) aligned, i.e., bit 2 of dst
           is set, then copy 1 word (4 bytes) to align it.  */
        ands    r3, r0, #4
        beq     two_word_aligned  /* If dst is already two-word aligned.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, r2, #4
        blt     copy_less_than_64

two_word_aligned:
        /* TODO: Align to cacheline (useful for PLD optimization).  */

        /* Every loop iteration copies 64 bytes.  */
1:
        .irp    offset, #0, #8, #16, #24, #32, #40, #48, #56
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
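
        /* The .irp above expands to eight LDRD/STRD pairs, i.e.
           "ldrd r4, r5, [r1, #0]; strd r4, r5, [r0, #0]" and so on
           through offset #56, moving 64 bytes per iteration.  */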

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     1b                     /* If there is more to copy.  */

copy_less_than_64:

        /* Get here if fewer than 64 bytes remain to copy, -64 <= r2 < 0.
           Restore the count if there are more than 7 bytes to copy.  */
        adds    r2, r2, #56
        blt     copy_less_than_8

        /* Copy 8 bytes at a time.  */
2:
        ldrd    r4, r5, [r1], #8
        strd    r4, r5, [r0], #8
        subs    r2, r2, #8
        bge     2b                     /* If there is more to copy.  */

copy_less_than_8:

        /* Get here if fewer than 8 bytes remain to copy, -8 <= r2 < 0.
           Check if there is more to copy.  */
        cmn     r2, #8
        beq     return                          /* If r2 + 8 == 0.  */

        /* Restore the count if there are more than 3 bytes to copy.  */
        adds    r2, r2, #4
        blt     copy_less_than_4

        /* Copy 4 bytes.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4

copy_less_than_4:
        /* Get here if fewer than 4 bytes remain to copy, -4 <= r2 < 0.  */

        /* Restore the count; check if there is more to copy.  */
        adds    r2, r2, #4
        beq     return                          /* If r2 == 0.  */

        /* Get here with r2 in {1, 2, 3} = {01, 10, 11}.  */
        /* Logical shift left r2 by 31, insert 0s, update flags.  */
        lsls    r2, r2, #31

        /* Copy byte by byte.
           Condition ne means bit 0 of r2 was set, i.e., r2 is 1 or 3:
           copy one byte.
           Condition cs means bit 1 of r2 was set, i.e., r2 is 2 or 3:
           copy two more bytes.  */
        itt     ne
        ldrbne  r3, [r1], #1
        strbne  r3, [r0], #1

        itttt   cs
        ldrbcs  r4, [r1], #1
        ldrbcs  r5, [r1]
        strbcs  r4, [r0], #1
        strbcs  r5, [r0]
return:
        /* Restore registers: optimized pop {r0, r4, r5, pc}.  */
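        /* The popped r0 is the destination pointer saved at entry,
           which is memcpy's required return value.  */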
        ldrd r4, r5, [sp], #8
        pop {r0, pc}         /* This is the only return point of memcpy.  */

dst_not_word_aligned:

       /* Get here when dst is not aligned and ip holds the last 2 bits
          of dst, i.e., ip is the offset of dst from a word boundary.
          The number of bytes that remain to copy is r2 + 4,
          i.e., there are at least 4 bytes to copy.
          Write a partial word (1 to 3 bytes), such that dst becomes
          word-aligned.  */

       /* If dst is at ip bytes offset from a word boundary (with
          0 < ip < 4), then there are (4 - ip) bytes to fill up to align
          dst to the next word.  */
        rsb     ip, ip, #4                 /* ip = #4 - ip.  */
        cmp     ip, #2

       /* Copy byte by byte with conditionals.  */
        itt     gt
        ldrbgt  r3, [r1], #1
        strbgt  r3, [r0], #1

        itt     ge
        ldrbge  r4, [r1], #1
        strbge  r4, [r0], #1

        ldrb    lr, [r1], #1
        strb    lr, [r0], #1
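
       /* Worked example: ip = 1 runs only the unconditional pair
          (1 byte); ip = 2 runs the ge pair plus the unconditional pair
          (2 bytes); ip = 3 runs all three pairs (3 bytes).  */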

       /* Update the count.
          ip holds the number of bytes we have just copied.  */
        subs    r2, r2, ip                        /* r2 = r2 - ip.  */
        blt     copy_less_than_4                  /* If r2 < ip.  */

       /* Get here if there are at least 4 bytes to copy.
          Check if src is aligned.  If beforehand src and dst were not
          word-aligned but congruent (same offset), then they are now
          both word-aligned, and we can copy the rest efficiently
          (without shifting).  */
        ands    ip, r1, #3                    /* ip = last 2 bits of src.  */
        beq     word_aligned                  /* If r1 is word-aligned.  */

src_not_word_aligned:
       /* Get here when src is not word-aligned, but dst is word-aligned.
          The number of bytes that remain to copy is r2+4.  */

       /* Copy word by word using LDR when unaligned access is supported
          in hardware, i.e., when SCTLR.A is clear, so that LDR and STR
          accept unaligned addresses.  */
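       /* On ARMv7-M targets the equivalent control is CCR.UNALIGN_TRP,
          which must be clear for this path to avoid UsageFaults on the
          unaligned loads.  */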
        subs    r2, r2, #60
        blt     8f

7:
        /* Copy 64 bytes in every loop iteration.  */
        .irp    offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
        ldr     r3, [r1, \offset]
        str     r3, [r0, \offset]
        .endr
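
        /* As above, the .irp expands to sixteen LDR/STR pairs, one per
           word; here the loads are unaligned while the stores are
           word-aligned.  */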

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     7b

8:
        /* Get here if fewer than 64 bytes remain to copy, -64 <= r2 < 0.
           Check if there are more than 3 bytes to copy.  */
        adds    r2, r2, #60
        blt     copy_less_than_4

9:
       /* Get here if fewer than 64 but at least 4 bytes remain to copy,
          where the number of bytes to copy is r2+4.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, r2, #4
        bge     9b

        b       copy_less_than_4


	.syntax unified
	.global __aeabi_memcpy4
	.type   __aeabi_memcpy4, %function
__aeabi_memcpy4:
	/* Assumes that both of its arguments are 4-byte aligned.  */

        push {r0, lr}
        strd r4, r5, [sp, #-8]!

        /* Are there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4       /* If n < 4.  */

	bl	word_aligned
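	/* The bl never returns here: every path through word_aligned ends
	   at the shared epilogue at 'return', which pops the lr saved
	   above directly into pc.  */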

	.syntax unified
	.global __aeabi_memcpy8
	.type   __aeabi_memcpy8, %function
__aeabi_memcpy8:
	/* Assumes that both of its arguments are 8-byte aligned.  */

        push {r0, lr}
        strd r4, r5, [sp, #-8]!

	/* Are there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4	/* If n < 4.  */

        /* Are there at least 8 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_8	/* If n < 8.  */

	/* Are there at least 64 bytes to copy?  */
	subs	r2, r2, #56
	blt	copy_less_than_64	/* If n < 64.  */

	bl	two_word_aligned
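	/* As in __aeabi_memcpy4, the return goes through the stacked lr
	   popped at 'return'.  */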