/*
 * Copyright 2022 NXP
 * All rights reserved.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

        .syntax unified

        .text
        .thumb

        .align 2

#ifndef MSDK_MISC_OVERRIDE_MEMCPY
#define MSDK_MISC_OVERRIDE_MEMCPY 1
#endif

/*
   This memcpy function is used to replace the GCC newlib function for these purposes:
   1. The newlib nano memcpy function uses byte-by-byte copy, which is slow.
   2. The newlib memcpy function for CM4, CM7, CM33 doesn't check address alignment,
      so it may fault when the address is unaligned and the memory region
      is device memory, which does not support unaligned access.

   This function is manually optimized based on the assembly result of the C function.
   The workflow is:
   1. Return directly if length is 0.
   2. If the source address is not 4-byte aligned, copy the unaligned part first, byte by byte.
   3. If the destination address is 4-byte aligned, then copy the 16-byte aligned part first,
      copying 16 bytes each loop, and then copy 8 bytes, 4 bytes, 2 bytes and 1 byte.
   4. If the destination address is not 4-byte aligned, load source data into a register word
      by word first, then store to memory based on the alignment requirement. For the
      remaining part, copy byte by byte.

   The source code of the C function is:

   #define __CPY_WORD(dst, src) \
       *(uint32_t *)(dst) = *(uint32_t *)(src); \
       (dst) = ((uint32_t *)dst) + 1; \
       (src) = ((uint32_t *)src) + 1

   #define __CPY_HWORD(dst, src) \
       *(uint16_t *)(dst) = *(uint16_t *)(src); \
       (dst) = ((uint16_t *)dst) + 1; \
       (src) = ((uint16_t *)src) + 1

   #define __CPY_BYTE(dst, src) \
       *(uint8_t *)(dst) = *(uint8_t *)(src); \
       (dst) = ((uint8_t *)dst) + 1; \
       (src) = ((uint8_t *)src) + 1

   void * memcpy(void *restrict dst, const void * restrict src, size_t n)
   {
       void *ret = dst;
       uint32_t tmp;

       if (0 == n) return ret;

       while (((uintptr_t)src & 0x03UL) != 0UL)
       {
           __CPY_BYTE(dst, src);
           n--;

           if (0 == n) return ret;
       }

       if (((uintptr_t)dst & 0x03UL) == 0UL)
       {
           while (n >= 16UL)
           {
               __CPY_WORD(dst, src);
               __CPY_WORD(dst, src);
               __CPY_WORD(dst, src);
               __CPY_WORD(dst, src);
               n-= 16UL;
           }

           if ((n & 0x08UL) != 0UL)
           {
               __CPY_WORD(dst, src);
               __CPY_WORD(dst, src);
           }

           if ((n & 0x04UL) != 0UL)
           {
               __CPY_WORD(dst, src);
           }

           if ((n & 0x02UL) != 0UL)
           {
               __CPY_HWORD(dst, src);
           }

           if ((n & 0x01UL) != 0UL)
           {
               __CPY_BYTE(dst, src);
           }
       }
       else
       {
           if (((uintptr_t)dst & 1UL) == 0UL)
           {
               while (n >= 4)
               {
                   tmp = *(uint32_t *)src;
                   src = ((uint32_t *)src) + 1;

                   *(volatile uint16_t *)dst = (uint16_t)tmp;
                   dst = ((uint16_t *)dst) + 1;
                   *(volatile uint16_t *)dst = (uint16_t)(tmp>>16U);
                   dst = ((uint16_t *)dst) + 1;

                   n-=4;
               }
           }
           else
           {
               while (n >= 4)
               {
                   tmp = *(uint32_t *)src;
                   src = ((uint32_t *)src) + 1;

                   *(volatile uint8_t *)dst = (uint8_t)tmp;
                   dst = ((uint8_t *)dst) + 1;
                   *(volatile uint16_t *)dst = (uint16_t)(tmp>>8U);
                   dst = ((uint16_t *)dst) + 1;
                   *(volatile uint8_t *)dst = (uint8_t)(tmp>>24U);
                   dst = ((uint8_t *)dst) + 1;
                   n-=4;
               }
           }

           while (n > 0)
           {
               __CPY_BYTE(dst, src);
               n--;
           }
       }

       return ret;
   }

   The test function is:

   void test_memcpy(uint8_t *dst, const uint8_t * src, size_t n)
   {
       uint8_t * ds;
       uint8_t * de;
       const uint8_t *ss;
       const uint8_t *se;
       uint8_t * ret;

       for (ss = src; ss < src+n; ss++)
       {
           for (se = ss; se < src + n; se ++)
           {
               size_t nn = (uintptr_t)se - (uintptr_t)ss;

               for (ds = dst; ds + nn < dst+n; ds++)
               {
                   de = ds + nn;

                   memset(dst, 0, n);

                   ret = memcpy(ds, ss, nn);

                   assert(ret == ds);

                   for (const uint8_t *data = dst; data < ds; data++)
                   {
                       assert(0 == *data);
                   }

                   for (const uint8_t *data = de; data < dst+n; data++)
                   {
                       assert(0 == *data);
                   }

                   assert(memcmp(ds, ss, nn) == 0);
               }
           }
       }
   }

   test_memcpy((uint8_t *)0x20240000, (const uint8_t *)0x202C0000, 48);

 */

#if MSDK_MISC_OVERRIDE_MEMCPY

/*
 * void *memcpy(void *dst, const void *src, size_t n)
 *
 * Register usage in this routine:
 *   r0      in:  dst (advanced while copying); out: the original dst, which is
 *           saved by the entry push and restored by the exit pop.
 *   r1      in:  src (advanced while copying).
 *   r2      in:  n, the number of bytes still to copy (treated as unsigned).
 *   r3      scratch, only used as the destination of flag-setting instructions.
 *   r4-r7   data registers; saved on entry and restored on exit.
 *
 * Every load from src is a byte load until src is 4-byte aligned and a word
 * (or narrower, naturally aligned) load afterwards. Every store to dst is
 * sized so that it is naturally aligned for the current dst address.
 */
        .thumb_func
        .align 2
        .global memcpy
        .type memcpy, %function

memcpy:
        push    {r0, r4, r5, r6, r7, lr}   /* r0 is saved so it can be returned unchanged. */
        cmp     r2, #0
        beq     ret                        /* If copy size is 0, return. */

src_word_unaligned:                        /* Copy single bytes until src is 4-byte aligned. */
        ands    r3, r1, #3
        beq.n   src_word_aligned           /* src is 4-byte aligned, jump. */
        ldrb    r4, [r1], #1
        subs    r2, r2, #1                 /* n-- */
        strb    r4, [r0], #1               /* strb leaves the flags from subs untouched. */
        beq.n   ret                        /* n == 0, return. */
        b.n     src_word_unaligned

src_word_aligned:
        ands    r3, r0, #3                 /* Check dst 4-byte alignment. */
        bne.n   dst_word_unaligned

dst_word_aligned:                          /* Both src and dst are 4-byte aligned here. */
        cmp     r2, #16
        bcc.n   size_ge_8                  /* Unsigned n < 16: skip the 16-byte loop. n is a
                                              size_t, so the test must be unsigned like the
                                              bcs at the bottom of the loop. */
size_ge_16:                                /* n >= 16: copy 16 bytes per pass with ldm/stm. */
        subs    r2, r2, #16                /* n -= 16 */
        ldmia   r1!, { r4, r5, r6, r7 }
        cmp     r2, #16
        stmia   r0!, { r4, r5, r6, r7 }    /* stmia does not change the flags set by cmp. */
        bcs.n   size_ge_16                 /* Loop while the remaining n >= 16 (unsigned). */
size_ge_8:                                 /* n < 16: copy 8 bytes if bit 3 of n is set. */
        lsls    r3, r2, #28                /* Bit 3 of n -> bit 31, i.e. the N flag. */
        itt     mi
        ldmiami r1!, { r4, r5 }
        stmiami r0!, { r4, r5 }
size_ge_4:                                 /* Copy 4 bytes if bit 2 of n is set. */
        lsls    r3, r2, #29
        itt     mi
        ldrmi   r4, [r1], #4
        strmi   r4, [r0], #4
size_ge_2:                                 /* Copy 2 bytes if bit 1 of n is set. */
        lsls    r3, r2, #30
        itt     mi
        ldrhmi  r4, [r1], #2
        strhmi  r4, [r0], #2
size_ge_1:                                 /* Copy the last byte if bit 0 of n is set. */
        lsls    r3, r2, #31
        itt     mi
        ldrbmi  r4, [r1]
        strbmi  r4, [r0]
        b.n     ret

dst_word_unaligned:                        /* src is word aligned, dst is not. */
        lsls    r3, r0, #31                /* Bit 0 of dst -> N flag. */
        bmi.n   dst_half_word_unaligned    /* dst is odd. */
dst_half_word_aligned:                     /* dst is 2-byte aligned: word load, two halfword stores. */
        cmp     r2, #4
        bcc.n   size_lt_4
        ldr     r4, [r1], #4
        subs    r2, r2, #4
        strh    r4, [r0], #2               /* Low halfword. */
        lsrs    r5, r4, #16
        strh    r5, [r0], #2               /* High halfword. */
        b       dst_half_word_aligned
dst_half_word_unaligned:                   /* dst is odd: word load, byte + halfword + byte stores. */
        cmp     r2, #4
        bcc.n   size_lt_4
        ldr     r4, [r1], #4
        subs    r2, r2, #4
        strb    r4, [r0], #1               /* Bits 7:0; dst becomes 2-byte aligned. */
        lsrs    r5, r4, #8
        strh    r5, [r0], #2               /* Bits 23:8. */
        lsrs    r6, r4, #24
        strb    r6, [r0], #1               /* Bits 31:24; dst is odd again. */
        b       dst_half_word_unaligned
size_lt_4:                                 /* Fewer than 4 bytes left: copy them one at a time. */
        cmp     r2, #0
        ittt    ne
        ldrbne  r4, [r1], #1
        strbne  r4, [r0], #1
        subne   r2, r2, #1                 /* No flag update, so bne below still tests the cmp. */
        bne     size_lt_4
ret:
        pop     {r0, r4, r5, r6, r7, pc}   /* Restore the original dst into r0 and return. */

        .size memcpy, . - memcpy

#endif /* MSDK_MISC_OVERRIDE_MEMCPY */