/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* This memcpy routine is optimised for Cortex-M3/M4 cores with/without
   unaligned access.

   If compiled with GCC, this file should be enclosed within the following
   pre-processing check:
   #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)

   Prototype: void *memcpy (void *dst, const void *src, size_t count);

   The job is done in 5 steps:
   Step 1: Align the src/dst pointers; fall back to a misaligned copy if
           both cannot be word-aligned.
   Step 2: Repeatedly copy big blocks of __OPT_BIG_BLOCK_SIZE bytes.
   Step 3: Repeatedly copy mid blocks of __OPT_MID_BLOCK_SIZE bytes.
   Step 4: Copy word by word.
   Step 5: Copy byte by byte.

   Tunable options:
     __OPT_BIG_BLOCK_SIZE: Size of the big block in bytes.  Defaults to 64.
     __OPT_MID_BLOCK_SIZE: Size of the mid block in bytes.  Defaults to 16.
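   As an illustrative sketch only, the same strategy in C with the default
   block sizes baked in (memcpy_model is a hypothetical name, and the
   misaligned-source handling below is richer than this model shows; the
   word accesses assume an implementation that permits them, as the
   assembly does):

       #include <stddef.h>
       #include <stdint.h>

       void *memcpy_model (void *dst, const void *src, size_t count)
       {
         unsigned char *d = dst;
         const unsigned char *s = src;

         if ((((uintptr_t) d | (uintptr_t) s) & 3) == 0)   // Step 1
           {
             while (count >= 64)                           // Step 2
               {
                 uint32_t *dw = (uint32_t *) d;
                 const uint32_t *sw = (const uint32_t *) s;
                 for (int i = 0; i < 16; i++)
                   dw[i] = sw[i];
                 d += 64; s += 64; count -= 64;
               }
             while (count >= 16)                           // Step 3
               {
                 uint32_t *dw = (uint32_t *) d;
                 const uint32_t *sw = (const uint32_t *) s;
                 for (int i = 0; i < 4; i++)
                   dw[i] = sw[i];
                 d += 16; s += 16; count -= 16;
               }
             while (count >= 4)                            // Step 4
               {
                 *(uint32_t *) d = *(const uint32_t *) s;
                 d += 4; s += 4; count -= 4;
               }
           }
         while (count--)                                   // Step 5
           *d++ = *s++;
         return dst;
       }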
 */

#include <picolibc.h>

#include "arm_asm.h"

#ifndef __OPT_BIG_BLOCK_SIZE
#define __OPT_BIG_BLOCK_SIZE (4 * 16)
#endif

#ifndef __OPT_MID_BLOCK_SIZE
#define __OPT_MID_BLOCK_SIZE (4 * 4)
#endif

#if __OPT_BIG_BLOCK_SIZE == 16
#define BEGIN_UNROLL_BIG_BLOCK \
        .irp offset, 0,4,8,12
#elif __OPT_BIG_BLOCK_SIZE == 32
#define BEGIN_UNROLL_BIG_BLOCK \
        .irp offset, 0,4,8,12,16,20,24,28
#elif __OPT_BIG_BLOCK_SIZE == 64
#define BEGIN_UNROLL_BIG_BLOCK \
        .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
#else
#error "Illegal __OPT_BIG_BLOCK_SIZE"
#endif

#if __OPT_MID_BLOCK_SIZE == 8
#define BEGIN_UNROLL_MID_BLOCK \
        .irp offset, 0,4
#elif __OPT_MID_BLOCK_SIZE == 16
#define BEGIN_UNROLL_MID_BLOCK \
        .irp offset, 0,4,8,12
#else
#error "Illegal __OPT_MID_BLOCK_SIZE"
#endif

#define END_UNROLL .endr

        .syntax unified
        .text
        .align  2
        .global memcpy
        .thumb
        .thumb_func
        .fnstart
        .cfi_sections .debug_frame
        .cfi_startproc
        .type   memcpy, %function
        ASM_ALIAS __aeabi_memcpy, memcpy
        ASM_ALIAS __aeabi_memcpy4, memcpy
        ASM_ALIAS __aeabi_memcpy8, memcpy
memcpy:
        // r0: dst
        // r1: src
        // r2: len
#ifdef __ARM_FEATURE_UNALIGNED
        /* When unaligned access is supported, ip is not used in the
           function body, so use it to preserve dst for the return
           value.  */
        prologue push_ip=HAVE_PAC_LEAF
        mov     ip, r0
#else
        prologue 0 push_ip=HAVE_PAC_LEAF
#endif /* __ARM_FEATURE_UNALIGNED */
        /* If either pointer has its low two bits set, take the
           misaligned path.  */
        orr     r3, r1, r0
        ands    r3, r3, #3
        bne     .Lmisaligned_copy

.Lbig_block:
        subs    r2, __OPT_BIG_BLOCK_SIZE
        blo     .Lmid_block

        /* Kernel loop for big block copy */
        .align 2
.Lbig_block_loop:
        BEGIN_UNROLL_BIG_BLOCK
#ifdef __ARM_ARCH_7EM__
        /* v7E-M: post-indexed load/store.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        END_UNROLL
#else /* __ARM_ARCH_7M__ */
        /* v7-M: offset addressing, with one pointer update per block.  */
        ldr     r3, [r1, \offset]
        str     r3, [r0, \offset]
        END_UNROLL
        adds    r0, __OPT_BIG_BLOCK_SIZE
        adds    r1, __OPT_BIG_BLOCK_SIZE
#endif
        subs    r2, __OPT_BIG_BLOCK_SIZE
        bhs     .Lbig_block_loop

.Lmid_block:
        /* Here r2 = bytes left - __OPT_BIG_BLOCK_SIZE; rebias it to
           bytes left - __OPT_MID_BLOCK_SIZE.  */
        adds    r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE
        blo     .Lcopy_word_by_word

        /* Kernel loop for mid block copy */
        .align 2
.Lmid_block_loop:
        BEGIN_UNROLL_MID_BLOCK
#ifdef __ARM_ARCH_7EM__
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        END_UNROLL
#else /* __ARM_ARCH_7M__ */
        ldr     r3, [r1, \offset]
        str     r3, [r0, \offset]
        END_UNROLL
        adds    r0, __OPT_MID_BLOCK_SIZE
        adds    r1, __OPT_MID_BLOCK_SIZE
#endif
        subs    r2, __OPT_MID_BLOCK_SIZE
        bhs     .Lmid_block_loop

.Lcopy_word_by_word:
        /* Rebias r2 to bytes left - 4.  */
        adds    r2, __OPT_MID_BLOCK_SIZE - 4
        blo     .Lcopy_less_than_4

        /* Kernel loop for small block copy */
        .align 2
.Lcopy_word_by_word_loop:
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, #4
        bhs     .Lcopy_word_by_word_loop

.Lcopy_less_than_4:
        adds    r2, #4
        beq     .Ldone

        /* The shift moves bit 0 of the remaining length into Z and bit 1
           into C: NE means one odd byte remains, CS means a trailing
           halfword remains.  */
        lsls    r2, r2, #31
        itt     ne
        ldrbne  r3, [r1], #1
        strbne  r3, [r0], #1

        bcc     .Ldone
#ifdef __ARM_FEATURE_UNALIGNED
        ldrh    r3, [r1]
        strh    r3, [r0]
#else
        ldrb    r3, [r1]
        strb    r3, [r0]
        ldrb    r3, [r1, #1]
        strb    r3, [r0, #1]
#endif /* __ARM_FEATURE_UNALIGNED */

.Ldone:
        .cfi_remember_state
#ifdef __ARM_FEATURE_UNALIGNED
        mov     r0, ip
        epilogue push_ip=HAVE_PAC_LEAF
#else
        epilogue 0 push_ip=HAVE_PAC_LEAF
#endif /* __ARM_FEATURE_UNALIGNED */

        .align 2
.Lmisaligned_copy:
        .cfi_restore_state
#ifdef __ARM_FEATURE_UNALIGNED
        /* Define the label Ldst_aligned as an alias for
           Lbig_block: once the destination has been aligned, control goes
           straight to the aligned copy.  */
#define Ldst_aligned Lbig_block

        /* With __ARM_FEATURE_UNALIGNED, LDR and STR handle unaligned
           addresses in hardware, so the word-by-word loops can be used
           even when both pointers cannot be aligned.  */

        cmp     r2, #8
        blo     .Lbyte_copy

        /* If src is aligned, just go to the big block loop.  */
        lsls    r3, r1, #30
        beq     .Ldst_aligned
#else
        /* If len < 12, the misalignment adjustment has more overhead than
           a plain byte-to-byte copy.  Also, len must be >= 8 for the code
           below to work correctly.  */
        cmp     r2, #12
        blo     .Lbyte_copy
#endif /* __ARM_FEATURE_UNALIGNED */

        /* Align dst only, without trying to align src, because handling an
           aligned src with a misaligned dst needs more overhead than the
           other way round.  The worst case is when src is initially
           aligned: up to 4 additional bytes are then copied one at a time,
           which is acceptable.  */

        ands    r3, r0, #3
        beq     .Ldst_aligned

        /* r3 = number of bytes (1 to 3) needed to align dst.  */
        rsb     r3, #4
        subs    r2, r3

        /* As above: NE means one odd byte, CS means a halfword.  */
        lsls    r3, r3, #31
        itt     ne
        ldrbne  r3, [r1], #1
        strbne  r3, [r0], #1

        bcc     .Ldst_aligned

#ifdef __ARM_FEATURE_UNALIGNED
        ldrh    r3, [r1], #2
        strh    r3, [r0], #2
        b       .Ldst_aligned
#else
        ldrb    r3, [r1], #1
        strb    r3, [r0], #1
        ldrb    r3, [r1], #1
        strb    r3, [r0], #1
        /* Now dst is aligned.  */
.Ldst_aligned:
        /* If r1 is aligned now, r0/r1 had the same misalignment and both
           are aligned now.  Go to the aligned copy.  */
        ands    r3, r1, #3
        beq     .Lbig_block

        /* dst is aligned, but src isn't: misaligned copy.  An illustrative
           C model of the technique appears in the comment at the end of
           this file.  */

        push    {r4, r5}
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset 4, 0
        .cfi_rel_offset 5, 4
        subs    r2, #4

        /* Move r1 back by the misaligned bytes to make it word-aligned.
           Since r1 must be restored to the unaligned address after the
           loop, keep the offset in ip and subtract it from r1 afterwards.  */
        subs    r1, r3
        rsb     ip, r3, #4

        /* Pre-load one word.  */
        ldr     r4, [r1], #4

        cmp     r3, #2
        beq     .Lmisaligned_copy_2_2
        cmp     r3, #3
        beq     .Lmisaligned_copy_3_1

        /* Each iteration merges the tail of the previous source word with
           the head of the next one, using a shift/OR pair.  */
        .macro mis_src_copy shift
1:
#ifdef __ARM_BIG_ENDIAN
        lsls    r4, r4, \shift
#else
        lsrs    r4, r4, \shift
#endif
        ldr     r3, [r1], #4
#ifdef __ARM_BIG_ENDIAN
        lsrs    r5, r3, 32-\shift
#else
        lsls    r5, r3, 32-\shift
#endif
        orr     r4, r4, r5
        str     r4, [r0], #4
        mov     r4, r3
        subs    r2, #4
        bhs     1b
        .endm

.Lmisaligned_copy_1_3:
        mis_src_copy shift=8
        b       .Lsrc_misaligned_tail

.Lmisaligned_copy_3_1:
        mis_src_copy shift=24
        b       .Lsrc_misaligned_tail

.Lmisaligned_copy_2_2:
        /* For 2_2 misalignment, ldr is still faster than 2 x ldrh.  */
        mis_src_copy shift=16

.Lsrc_misaligned_tail:
        adds    r2, #4
        subs    r1, ip
        pop     {r4, r5}
        .cfi_restore 4
        .cfi_restore 5
        .cfi_adjust_cfa_offset -8

#endif /* __ARM_FEATURE_UNALIGNED */

.Lbyte_copy:
        subs    r2, #4
        blo     .Lcopy_less_than_4

.Lbyte_copy_loop:
        subs    r2, #1
        ldrb    r3, [r1], #1
        strb    r3, [r0], #1
        bhs     .Lbyte_copy_loop

        /* The loop above copied len - 3 bytes; copy the remaining three
           unconditionally.  */
        ldrb    r3, [r1]
        strb    r3, [r0]
        ldrb    r3, [r1, #1]
        strb    r3, [r0, #1]
        ldrb    r3, [r1, #2]
        strb    r3, [r0, #2]

#ifdef __ARM_FEATURE_UNALIGNED
        mov     r0, ip
        epilogue push_ip=HAVE_PAC_LEAF
#else
        epilogue 0 push_ip=HAVE_PAC_LEAF
#endif /* __ARM_FEATURE_UNALIGNED */
        .cfi_endproc
        .cantunwind
        .fnend
        .size   memcpy, .-memcpy
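/* An illustrative C model of the mis_src_copy technique above, for the
   little-endian case only (copy_misaligned_src_le is a hypothetical name,
   not part of this file's interface).  The source pointer is rounded down
   to a word boundary, whole aligned words are loaded, and each output word
   is assembled from two neighbouring input words with a shift/OR pair.
   Like the assembly, it may read up to 3 bytes past the end of the source,
   staying within the last aligned word, which is fine for this hardware
   but not strictly portable C:

       #include <stddef.h>
       #include <stdint.h>

       // d is word-aligned; s is not (s & 3 != 0); count >= 4.
       // Copies (count / 4) * 4 bytes; the caller handles the tail.
       static void copy_misaligned_src_le (uint32_t *d,
                                           const unsigned char *s,
                                           size_t count)
       {
         unsigned misalign = (uintptr_t) s & 3;        // 1, 2 or 3
         unsigned shift = 8 * misalign;                // 8, 16 or 24
         const uint32_t *ws = (const uint32_t *) (s - misalign);
         uint32_t cur = *ws++;                         // pre-load one word

         while (count >= 4)
           {
             uint32_t next = *ws++;
             *d++ = (cur >> shift) | (next << (32 - shift));
             cur = next;
             count -= 4;
           }
       }
 */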