1/*
2 * Copyright 2022 NXP
3 * All rights reserved.
4 *
5 * SPDX-License-Identifier: BSD-3-Clause
6 */
7
8    .syntax unified
9
10    .text
11    .thumb
12
13    .align 2
14
15#ifndef MSDK_MISC_OVERRIDE_MEMCPY
16#define MSDK_MISC_OVERRIDE_MEMCPY 1
17#endif
18
19/*
   This memcpy function is used to replace the GCC newlib function for these purposes:
   1. The newlib-nano memcpy function copies byte by byte, which is slow.
   2. The newlib memcpy function for CM4, CM7, CM33 doesn't check address alignment,
      so it may fault when the address is unaligned and the memory region
      is device memory, which does not support unaligned access.
25
   This function is manually optimized based on the assembly output of the C function.
27   The workflow is:
28   1. Return directly if length is 0.
29   2. If the source address is not 4-byte aligned, copy the unaligned part first byte by byte.
   3. If the destination address is 4-byte aligned, copy 16 bytes per loop iteration while
      at least 16 bytes remain, and then copy the remaining 8-byte, 4-byte, 2-byte and 1-byte parts.
   4. If the destination address is not 4-byte aligned, load the source data into a register word
      by word first, then store it to memory based on the alignment requirement. For the remaining part,
      copy byte by byte.
35
36   The source code of the c function is:
37
38   #define __CPY_WORD(dst, src) \
39       *(uint32_t *)(dst) = *(uint32_t *)(src); \
40       (dst) = ((uint32_t *)dst) + 1; \
41       (src) = ((uint32_t *)src) + 1
42
43   #define __CPY_HWORD(dst, src) \
44       *(uint16_t *)(dst) = *(uint16_t *)(src); \
45       (dst) = ((uint16_t *)dst) + 1; \
46       (src) = ((uint16_t *)src) + 1
47
48   #define __CPY_BYTE(dst, src) \
49       *(uint8_t *)(dst) = *(uint8_t *)(src); \
50       (dst) = ((uint8_t *)dst) + 1; \
51       (src) = ((uint8_t *)src) + 1
52
53   void * memcpy(void *restrict  dst, const void * restrict src, size_t n)
54   {
55       void *ret = dst;
56       uint32_t tmp;
57
58       if (0 == n) return ret;
59
60       while (((uintptr_t)src & 0x03UL) != 0UL)
61       {
62           __CPY_BYTE(dst, src);
63           n--;
64
65           if (0 == n) return ret;
66       }
67
68       if (((uintptr_t)dst & 0x03UL) == 0UL)
69       {
70           while (n >= 16UL)
71           {
72               __CPY_WORD(dst, src);
73               __CPY_WORD(dst, src);
74               __CPY_WORD(dst, src);
75               __CPY_WORD(dst, src);
76               n-= 16UL;
77           }
78
79           if ((n & 0x08UL) != 0UL)
80           {
81               __CPY_WORD(dst, src);
82               __CPY_WORD(dst, src);
83           }
84
85           if ((n & 0x04UL) != 0UL)
86           {
87               __CPY_WORD(dst, src);
88           }
89
90           if ((n & 0x02UL) != 0UL)
91           {
92               __CPY_HWORD(dst, src);
93           }
94
95           if ((n & 0x01UL) != 0UL)
96           {
97               __CPY_BYTE(dst, src);
98           }
99       }
100       else
101       {
102           if (((uintptr_t)dst & 1UL) == 0UL)
103           {
104               while (n >= 4)
105               {
106                   tmp = *(uint32_t *)src;
107                   src = ((uint32_t *)src) + 1;
108
109                   *(volatile uint16_t *)dst = (uint16_t)tmp;
110                   dst = ((uint16_t *)dst) + 1;
111                   *(volatile uint16_t *)dst = (uint16_t)(tmp>>16U);
112                   dst = ((uint16_t *)dst) + 1;
113
114                   n-=4;
115               }
116           }
117           else
118           {
119               while (n >= 4)
120               {
121                   tmp = *(uint32_t *)src;
122                   src = ((uint32_t *)src) + 1;
123
124                   *(volatile uint8_t *)dst  = (uint8_t)tmp;
125                   dst = ((uint8_t *)dst) + 1;
126                   *(volatile uint16_t *)dst = (uint16_t)(tmp>>8U);
127                   dst = ((uint16_t *)dst) + 1;
128                   *(volatile uint8_t *)dst = (uint8_t)(tmp>>24U);
129                   dst = ((uint8_t *)dst) + 1;
130                   n-=4;
131               }
132           }
133
134           while (n > 0)
135           {
136               __CPY_BYTE(dst, src);
137               n--;
138           }
139       }
140
141       return ret;
142   }
143
144   The test function is:
145
146   void test_memcpy(uint8_t *dst, const uint8_t * src, size_t n)
147   {
148       uint8_t * ds;
149       uint8_t * de;
150       const uint8_t *ss;
151       const uint8_t *se;
152       uint8_t * ret;
153
154       for (ss = src; ss < src+n; ss++)
155       {
156           for (se = ss; se < src + n; se ++)
157           {
158               size_t nn = (uintptr_t)se - (uintptr_t)ss;
159
160               for (ds = dst; ds + nn < dst+n; ds++)
161               {
162                   de = ds + nn;
163
164                   memset(dst, 0, n);
165
166                   ret = memcpy(ds, ss, nn);
167
168                   assert(ret == ds);
169
170                   for (const uint8_t *data = dst; data < ds; data++)
171                   {
172                       assert(0 == *data);
173                   }
174
175                   for (const uint8_t *data = de; data < dst+n; data++)
176                   {
177                       assert(0 == *data);
178                   }
179
180                   assert(memcmp(ds, ss, nn) == 0);
181               }
182           }
183       }
184   }
185
186   test_memcpy((uint8_t *)0x20240000, (const uint8_t *)0x202C0000, 48);
187
188 */
189
#if MSDK_MISC_OVERRIDE_MEMCPY

/*
 * void *memcpy(void *restrict dst, const void *restrict src, size_t n)
 *
 * Alignment-aware copy that only issues accesses which are naturally aligned
 * for their size.
 *
 * In:    r0 = dst, r1 = src, r2 = n (unsigned byte count)
 * Out:   r0 = original dst (reloaded from the stack on exit)
 * Uses:  r3 as flag scratch, r4-r7 as data registers (saved and restored here)
 * Stack: 24 bytes (r0, r4-r7, lr)
 *
 * Strategy:
 *   1. n == 0: return immediately.
 *   2. Copy single bytes until src is 4-byte aligned (or n reaches 0).
 *   3. dst 4-byte aligned:     16-byte ldm/stm loop, then an 8/4/2/1-byte tail
 *                              selected by bits 3..0 of the remaining count.
 *   4. dst 2-byte aligned:     load a word, store it as two halfwords.
 *      dst odd:                load a word, store it as byte + halfword + byte.
 *      In both cases the remaining (< 4) bytes are copied one at a time.
 *
 * All comparisons on n are unsigned because n is a size_t.
 * The split word stores write the low-order part first, which reproduces the
 * source byte order on a little-endian target.
 */

    .thumb_func
    .align 2
    .global  memcpy
    .type    memcpy, %function

memcpy:
    push    {r0, r4, r5, r6, r7, lr}        /* The saved r0 becomes the return value. */
    cmp     r2, #0
    beq     .Lmemcpy_ret                    /* n == 0: nothing to copy. */

.Lmemcpy_align_src:                         /* Byte copy until src is word aligned. */
    ands    r3, r1, #3
    beq.n   .Lmemcpy_src_aligned
    ldrb    r4, [r1], #1
    subs    r2, r2, #1                      /* n-- */
    strb    r4, [r0], #1                    /* strb leaves the flags from subs intact. */
    beq.n   .Lmemcpy_ret                    /* n reached 0 while aligning. */
    b.n     .Lmemcpy_align_src

.Lmemcpy_src_aligned:                       /* src is word aligned, n > 0. */
    ands    r3, r0, #3
    bne.n   .Lmemcpy_dst_unaligned

    /* src and dst are both word aligned. */
    cmp     r2, #16
    blo.n   .Lmemcpy_tail_8                 /* Unsigned compare: n < 16 skips the block loop. */
.Lmemcpy_loop_16:                           /* n >= 16: move four words per iteration. */
    subs    r2, r2, #16                     /* n -= 16 */
    ldmia   r1!, { r4, r5, r6, r7 }
    cmp     r2, #16                         /* Flags survive the stmia below. */
    stmia   r0!, { r4, r5, r6, r7 }
    bhs.n   .Lmemcpy_loop_16
.Lmemcpy_tail_8:                            /* n < 16: each lsls moves one bit of n into N. */
    lsls    r3, r2, #28                     /* Bit 3 set: copy 8 bytes. */
    itt     mi
    ldmiami r1!, { r4, r5 }
    stmiami r0!, { r4, r5 }
    lsls    r3, r2, #29                     /* Bit 2 set: copy 4 bytes. */
    itt     mi
    ldrmi   r4, [r1], #4
    strmi   r4, [r0], #4
    lsls    r3, r2, #30                     /* Bit 1 set: copy 2 bytes. */
    itt     mi
    ldrhmi  r4, [r1], #2
    strhmi  r4, [r0], #2
    lsls    r3, r2, #31                     /* Bit 0 set: copy the last byte. */
    itt     mi
    ldrbmi  r4, [r1]
    strbmi  r4, [r0]
    b.n     .Lmemcpy_ret

.Lmemcpy_dst_unaligned:                     /* src word aligned, dst not word aligned. */
    lsls    r3, r0, #31                     /* Bit 0 of dst -> N flag. */
    bmi.n   .Lmemcpy_dst_odd
.Lmemcpy_dst_half:                          /* dst is halfword aligned: word -> 2 halfwords. */
    cmp     r2, #4
    blo.n   .Lmemcpy_bytes
    ldr     r4, [r1], #4
    subs    r2, r2, #4                      /* n -= 4 */
    strh    r4, [r0], #2                    /* Low halfword. */
    lsrs    r5, r4, #16
    strh    r5, [r0], #2                    /* High halfword. */
    b       .Lmemcpy_dst_half
.Lmemcpy_dst_odd:                           /* dst is odd: word -> byte, halfword, byte. */
    cmp     r2, #4
    blo.n   .Lmemcpy_bytes
    ldr     r4, [r1], #4
    subs    r2, r2, #4                      /* n -= 4 */
    strb    r4, [r0], #1                    /* Byte 0; dst is now halfword aligned. */
    lsrs    r5, r4, #8
    strh    r5, [r0], #2                    /* Bytes 1-2. */
    lsrs    r6, r4, #24
    strb    r6, [r0], #1                    /* Byte 3; dst is odd again. */
    b       .Lmemcpy_dst_odd
.Lmemcpy_bytes:                             /* Fewer than 4 bytes left: copy them singly. */
    cmp     r2, #0
    ittt    ne
    ldrbne  r4, [r1], #1
    strbne  r4, [r0], #1
    subne   r2, r2, #1                      /* Non-flag-setting, so NE from cmp still holds. */
    bne     .Lmemcpy_bytes
.Lmemcpy_ret:
    pop     {r0, r4, r5, r6, r7, pc}        /* r0 = original dst; return through pc. */

    .size   memcpy, . - memcpy

#endif /* MSDK_MISC_OVERRIDE_MEMCPY */
280