/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* This memcpy routine is optimised for Cortex-M3/M4 cores with or without
   unaligned access support.

   If compiled with GCC, this file should be enclosed within the following
   pre-processing check:
   #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)

   Prototype: void *memcpy (void *dst, const void *src, size_t count);

   The job is done in 5 steps:
   Step 1: Align the src/dst pointers; if both cannot be aligned, copy
	   misaligned.
   Step 2: Repeatedly copy big blocks of __OPT_BIG_BLOCK_SIZE bytes.
   Step 3: Repeatedly copy mid blocks of __OPT_MID_BLOCK_SIZE bytes.
   Step 4: Copy word by word.
   Step 5: Copy byte by byte.

   Tunable options:
     __OPT_BIG_BLOCK_SIZE: Size of a big block in bytes.  Defaults to 64.
     __OPT_MID_BLOCK_SIZE: Size of a mid block in bytes.  Defaults to 16.
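
   As a rough, illustrative C model only (the name memcpy_model below is
   not defined anywhere in this library), the aligned copy of steps 2-5
   behaves approximately as follows, assuming step 1 has already made
   both pointers word-aligned:

     #include <stddef.h>

     static void *memcpy_model (void *dst, const void *src, size_t count)
     {
       unsigned int *d = dst;
       const unsigned int *s = src;

       // Steps 2, 3 and 4: 64-byte, 16-byte and then 4-byte chunks.
       for (size_t chunk = 64; chunk >= 4; chunk /= 4)
         while (count >= chunk)
           {
             for (size_t i = 0; i < chunk / 4; i++)
               *d++ = *s++;
             count -= chunk;
           }

       // Step 5: trailing bytes.
       unsigned char *db = (unsigned char *) d;
       const unsigned char *sb = (const unsigned char *) s;
       while (count--)
         *db++ = *sb++;

       return dst;
     }

   The assembly below additionally keeps the remaining length biased by
   the current block size, so each stage rebalances the counter with a
   single add before falling through to the next one.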
 */
#include "arm_asm.h"

#ifndef __OPT_BIG_BLOCK_SIZE
#define __OPT_BIG_BLOCK_SIZE (4 * 16)
#endif

#ifndef __OPT_MID_BLOCK_SIZE
#define __OPT_MID_BLOCK_SIZE (4 * 4)
#endif

#if __OPT_BIG_BLOCK_SIZE == 16
#define BEGIN_UNROLL_BIG_BLOCK \
	.irp offset, 0,4,8,12
#elif __OPT_BIG_BLOCK_SIZE == 32
#define BEGIN_UNROLL_BIG_BLOCK \
	.irp offset, 0,4,8,12,16,20,24,28
#elif __OPT_BIG_BLOCK_SIZE == 64
#define BEGIN_UNROLL_BIG_BLOCK \
	.irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
#else
#error "Illegal __OPT_BIG_BLOCK_SIZE"
#endif

#if __OPT_MID_BLOCK_SIZE == 8
#define BEGIN_UNROLL_MID_BLOCK \
	.irp offset, 0,4
#elif __OPT_MID_BLOCK_SIZE == 16
#define BEGIN_UNROLL_MID_BLOCK \
	.irp offset, 0,4,8,12
#else
#error "Illegal __OPT_MID_BLOCK_SIZE"
#endif

#define END_UNROLL .endr

.macro ASM_ALIAS new old
	.global	\new
	.type	\new, %function
#if defined (__thumb__)
	.thumb_set \new, \old
#else
	.set	\new, \old
#endif
.endm

	.syntax unified
	.text
	.align	2
	.global	memcpy
	.thumb
	.thumb_func
	.fnstart
	.cfi_startproc
	.type	memcpy, %function
	ASM_ALIAS __aeabi_memcpy, memcpy
	ASM_ALIAS __aeabi_memcpy4, memcpy
	ASM_ALIAS __aeabi_memcpy8, memcpy
memcpy:
	@ r0: dst
	@ r1: src
	@ r2: len
#ifdef __ARM_FEATURE_UNALIGNED
	/* When unaligned access is supported, ip is not otherwise used in
	   the function body, so use it to preserve the return value.  */
	prologue push_ip=HAVE_PAC_LEAF
	mov	ip, r0
#else
	prologue 0 push_ip=HAVE_PAC_LEAF
#endif /* __ARM_FEATURE_UNALIGNED */
	/* Take the misaligned-copy path if either pointer is not
	   word-aligned.  */
	orr	r3, r1, r0
	ands	r3, r3, #3
	bne	.Lmisaligned_copy

.Lbig_block:
	subs	r2, __OPT_BIG_BLOCK_SIZE
	blo	.Lmid_block

	/* Kernel loop for big block copy.  */
	.align 2
.Lbig_block_loop:
	BEGIN_UNROLL_BIG_BLOCK
#ifdef __ARM_ARCH_7EM__
	ldr	r3, [r1], #4
	str	r3, [r0], #4
	END_UNROLL
#else /* __ARM_ARCH_7M__ */
	ldr	r3, [r1, \offset]
	str	r3, [r0, \offset]
	END_UNROLL
	adds	r0, __OPT_BIG_BLOCK_SIZE
	adds	r1, __OPT_BIG_BLOCK_SIZE
#endif
	subs	r2, __OPT_BIG_BLOCK_SIZE
	bhs	.Lbig_block_loop

.Lmid_block:
	adds	r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE
	blo	.Lcopy_word_by_word

	/* Kernel loop for mid-block copy.  */
	.align 2
.Lmid_block_loop:
	BEGIN_UNROLL_MID_BLOCK
#ifdef __ARM_ARCH_7EM__
	ldr	r3, [r1], #4
	str	r3, [r0], #4
	END_UNROLL
#else /* __ARM_ARCH_7M__ */
	ldr	r3, [r1, \offset]
	str	r3, [r0, \offset]
	END_UNROLL
	adds	r0, __OPT_MID_BLOCK_SIZE
	adds	r1, __OPT_MID_BLOCK_SIZE
#endif
	subs	r2, __OPT_MID_BLOCK_SIZE
	bhs	.Lmid_block_loop

.Lcopy_word_by_word:
	adds	r2, __OPT_MID_BLOCK_SIZE - 4
	blo	.Lcopy_less_than_4

	/* Kernel loop for small block copy.  */
	.align 2
.Lcopy_word_by_word_loop:
	ldr	r3, [r1], #4
	str	r3, [r0], #4
	subs	r2, #4
	bhs	.Lcopy_word_by_word_loop

.Lcopy_less_than_4:
	adds	r2, #4
	beq	.Ldone

	/* 1 to 3 bytes remain.  After this shift, Z is clear iff bit 0 of
	   the remaining length is set and the carry holds bit 1: copy one
	   byte on NE, then a halfword unless the carry is clear.  */
	lsls	r2, r2, #31
	itt	ne
	ldrbne	r3, [r1], #1
	strbne	r3, [r0], #1

	bcc	.Ldone
#ifdef __ARM_FEATURE_UNALIGNED
	ldrh	r3, [r1]
	strh	r3, [r0]
#else
	ldrb	r3, [r1]
	strb	r3, [r0]
	ldrb	r3, [r1, #1]
	strb	r3, [r0, #1]
#endif /* __ARM_FEATURE_UNALIGNED */

.Ldone:
	.cfi_remember_state
#ifdef __ARM_FEATURE_UNALIGNED
	mov	r0, ip
	epilogue push_ip=HAVE_PAC_LEAF
#else
	epilogue 0 push_ip=HAVE_PAC_LEAF
#endif /* __ARM_FEATURE_UNALIGNED */

	.align 2
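/* For illustration only: when the core lacks unaligned-access support and
   src cannot be word-aligned, the mis_src_copy macro below reads aligned
   words from the rounded-down src address and merges neighbouring words by
   shifting.  The name copy_shifted in this C sketch is purely illustrative
   (it exists nowhere else), a little-endian core is assumed, and the
   pointer cast is shown only to mirror the assembly:

     #include <stddef.h>

     // d is word-aligned; s is misaligned by off (1..3) bytes.
     static void copy_shifted (unsigned int *d, const unsigned char *s,
                               size_t words, unsigned off)
     {
       const unsigned int *sw = (const unsigned int *) (s - off);
       unsigned int prev = *sw++;              // pre-load one word
       while (words--)
         {
           unsigned int next = *sw++;
           *d++ = (prev >> (8 * off)) | (next << (32 - 8 * off));
           prev = next;
         }
     }
 */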
.Lmisaligned_copy:
	.cfi_restore_state
#ifdef __ARM_FEATURE_UNALIGNED
	/* Alias the label Ldst_aligned to Lbig_block: once the destination
	   has been adjusted to be word-aligned, control goes straight to
	   the aligned copy.  */
#define Ldst_aligned Lbig_block

	/* Copy word by word using LDR when the hardware handles the
	   alignment, i.e., unaligned accesses do not fault (SCTLR.A /
	   CCR.UNALIGN_TRP clear), so LDR and STR accept unaligned
	   addresses.  */

	cmp	r2, #8
	blo	.Lbyte_copy

	/* If src is aligned, just go to the big block loop.  */
	lsls	r3, r1, #30
	beq	.Ldst_aligned
#else
	/* If len < 12, the misalignment adjustment has more overhead than
	   a simple byte-by-byte copy.  Also, len must be >= 8 to guarantee
	   that the code below works correctly.  */
	cmp	r2, #12
	blo	.Lbyte_copy
#endif /* __ARM_FEATURE_UNALIGNED */

	/* Align dst only; do not try to align src, because handling an
	   aligned src together with a misaligned dst needs more overhead
	   than the other way round.  The worst case of this choice is an
	   initially aligned src: up to 4 additional bytes of copying are
	   executed, which is acceptable.  */

	ands	r3, r0, #3
	beq	.Ldst_aligned

	rsb	r3, #4
	subs	r2, r3

	lsls	r3, r3, #31
	itt	ne
	ldrbne	r3, [r1], #1
	strbne	r3, [r0], #1

	bcc	.Ldst_aligned

#ifdef __ARM_FEATURE_UNALIGNED
	ldrh	r3, [r1], #2
	strh	r3, [r0], #2
	b	.Ldst_aligned
#else
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	/* dst is now aligned.  */
.Ldst_aligned:
	/* If r1 is aligned now as well, r0 and r1 had the same misalignment
	   and both are aligned; go to the aligned copy.  */
	ands	r3, r1, #3
	beq	.Lbig_block

	/* dst is aligned, but src isn't.  Misaligned copy.  */

	push	{r4, r5}
	.cfi_adjust_cfa_offset 8
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	subs	r2, #4

	/* Rewind r1 by the misaligned byte count to make it word-aligned.
	   Since r1 must be restored to its unaligned address after the
	   loop, keep the compensating offset (4 - misalignment) in ip and
	   subtract it from r1 afterwards.  */
	subs	r1, r3
	rsb	ip, r3, #4

	/* Pre-load one word.  */
	ldr	r4, [r1], #4

	cmp	r3, #2
	beq	.Lmisaligned_copy_2_2
	cmp	r3, #3
	beq	.Lmisaligned_copy_3_1

	/* Merge pairs of aligned words: shift out the already-consumed
	   bytes of the current word and OR in the needed bytes of the next
	   one (the shift directions depend on endianness).  */
	.macro mis_src_copy shift
1:
#ifdef __ARM_BIG_ENDIAN
	lsls	r4, r4, \shift
#else
	lsrs	r4, r4, \shift
#endif
	ldr	r3, [r1], #4
#ifdef __ARM_BIG_ENDIAN
	lsrs	r5, r3, 32-\shift
#else
	lsls	r5, r3, 32-\shift
#endif
	orr	r4, r4, r5
	str	r4, [r0], #4
	mov	r4, r3
	subs	r2, #4
	bhs	1b
	.endm

.Lmisaligned_copy_1_3:
	mis_src_copy shift=8
	b	.Lsrc_misaligned_tail

.Lmisaligned_copy_3_1:
	mis_src_copy shift=24
	b	.Lsrc_misaligned_tail

.Lmisaligned_copy_2_2:
	/* For 2_2 misalignment, ldr is still faster than 2 x ldrh.
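	   For example (little-endian, src % 4 == 2): the pre-loaded word
	   holds the two wanted bytes S0,S1 in its upper half and the next
	   aligned load holds S2,S3 in its lower half, so
	       (prev >> 16) | (next << 16)
	   yields the word S0 S1 S2 S3 in memory order.  This is exactly
	   what mis_src_copy shift=16 computes, with one new load per output
	   word instead of two.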
	 */
	mis_src_copy shift=16

.Lsrc_misaligned_tail:
	adds	r2, #4
	subs	r1, ip
	pop	{r4, r5}
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -8

#endif /* __ARM_FEATURE_UNALIGNED */

.Lbyte_copy:
	subs	r2, #4
	blo	.Lcopy_less_than_4

	/* The loop below copies len - 3 bytes (r2 is biased by -4); the
	   three unconditional byte copies after it finish the job.  */
.Lbyte_copy_loop:
	subs	r2, #1
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	bhs	.Lbyte_copy_loop

	ldrb	r3, [r1]
	strb	r3, [r0]
	ldrb	r3, [r1, #1]
	strb	r3, [r0, #1]
	ldrb	r3, [r1, #2]
	strb	r3, [r0, #2]

#ifdef __ARM_FEATURE_UNALIGNED
	mov	r0, ip
	epilogue push_ip=HAVE_PAC_LEAF
#else
	epilogue 0 push_ip=HAVE_PAC_LEAF
#endif /* __ARM_FEATURE_UNALIGNED */
	.cfi_endproc
	.cantunwind
	.fnend
	.size	memcpy, .-memcpy