/*
 * Copyright (c) 2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "arm_asm.h"

/* NOTE: This ifdef MUST match the one in aeabi_memcpy.c.  */
#if !(defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)) && \
    defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) && \
    (defined (__ARM_NEON__) || !defined (__SOFTFP__))

        .syntax unified
        .global __aeabi_memcpy
        .type   __aeabi_memcpy, %function
__aeabi_memcpy:
        /* Assumes that n >= 0 and that dst, src are valid pointers.
           If there are at least 8 bytes to copy, use LDRD/STRD.
           If src and dst are misaligned with different offsets,
           first copy byte by byte until dst is aligned,
           and then copy using LDRD/STRD and shift if needed.
           When less than 8 bytes are left, copy a word and then byte by byte.  */

        /* Save registers (r0 holds the return value):
           optimized push {r0, r4, r5, lr}.
           To try to improve performance, the stack layout is changed,
           i.e., we do not keep the stack looking the way users expect
           (highest-numbered register at highest address).  */
        push    {r0, lr}
        strd    r4, r5, [sp, #-8]!

        /* Get copying of tiny blocks out of the way first.  */
        /* Is there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4        /* If n < 4.  */

        /* Check word alignment.  */
        ands    ip, r0, #3              /* ip = last 2 bits of dst.  */
        bne     dst_not_word_aligned    /* If dst is not word-aligned.  */

        /* Get here if dst is word-aligned.  */
        ands    ip, r1, #3              /* ip = last 2 bits of src.  */
        bne     src_not_word_aligned    /* If src is not word-aligned.  */
word_aligned:
        /* Get here if source and dst both are word-aligned.
           The number of bytes remaining to copy is r2+4.  */

        /* Is there at least 64 bytes to copy?  */
        subs    r2, r2, #60
        blt     copy_less_than_64       /* If r2 + 4 < 64.  */

        /* First, align the destination buffer to 8 bytes,
           to make sure double loads and stores do not cross a cache line
           boundary, as they are then more expensive even if the data is in
           the cache (they require two load/store issue cycles instead of one).
           If only one of the buffers is not 8-byte aligned,
           then it is more important to align dst than src,
           because the penalty for stores that cross a cache line boundary
           is larger than for loads.
           This check and realignment are only worth doing
           if there is a lot to copy.  */
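
        /* Illustrative only (kept inside a comment so nothing extra is
           assembled): a rough C-level sketch of the fast path below, where
           dst/src stand for the byte pointers held in r0/r1 and n for the
           number of bytes still to copy (r2 + 64 at this point):

               if (dst & 4) {                    // make dst 8-byte aligned
                   *(uint32_t *) dst = *(uint32_t *) src;
                   dst += 4; src += 4; n -= 4;
               }
               while (n >= 64) {                 // LDRD/STRD, 64 bytes per pass
                   for (int i = 0; i < 64; i += 8)
                       *(uint64_t *) (dst + i) = *(uint64_t *) (src + i);
                   dst += 64; src += 64; n -= 64;
               }
               // then fall through to the tail handling (< 64 bytes) below.  */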

        /* Get here if dst is word-aligned,
           i.e., the 2 least significant bits are 0.
           If dst is not 2w (8-byte) aligned, i.e., bit 2 of dst is set,
           then copy 1 word (4 bytes) so that it becomes 8-byte aligned.  */
        ands    r3, r0, #4
        beq     two_word_aligned        /* If dst already two-word aligned.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, r2, #4
        blt     copy_less_than_64

two_word_aligned:
        /* TODO: Align to cacheline (useful for PLD optimization).  */

        /* Every loop iteration copies 64 bytes.  */
1:
        .irp    offset, #0, #8, #16, #24, #32, #40, #48, #56
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     1b                      /* If there is more to copy.  */

copy_less_than_64:

        /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
           Restore the count if there are more than 7 bytes to copy.  */
        adds    r2, r2, #56
        blt     copy_less_than_8

        /* Copy 8 bytes at a time.  */
2:
        ldrd    r4, r5, [r1], #8
        strd    r4, r5, [r0], #8
        subs    r2, r2, #8
        bge     2b                      /* If there is more to copy.  */

copy_less_than_8:

        /* Get here if less than 8 bytes to copy, -8 <= r2 < 0.
           Check if there is more to copy.  */
        cmn     r2, #8
        beq     return                  /* If r2 + 8 == 0.  */

        /* Restore the count if there are more than 3 bytes to copy.  */
        adds    r2, r2, #4
        blt     copy_less_than_4

        /* Copy 4 bytes.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4

copy_less_than_4:
        /* Get here if less than 4 bytes to copy, -4 <= r2 < 0.  */

        /* Restore the count, check if there is more to copy.  */
        adds    r2, r2, #4
        beq     return                  /* If r2 == 0.  */

        /* Get here with r2 in {1,2,3} = {01,10,11}.  */
        /* Logical shift left r2, insert 0s, update flags.  */
        lsls    r2, r2, #31

        /* Copy byte by byte.
           Condition ne means the last bit of r2 is set,
           i.e., r2 is 1 or 3, so copy one byte.
           Condition cs means the second-to-last bit of r2 is set,
           i.e., r2 is 2 or 3, so copy two bytes.  */
        itt     ne
        ldrbne  r3, [r1], #1
        strbne  r3, [r0], #1

        itttt   cs
        ldrbcs  r4, [r1], #1
        ldrbcs  r5, [r1]
        strbcs  r4, [r0], #1
        strbcs  r5, [r0]

return:
        /* Restore registers: optimized pop {r0, r4, r5, pc}.  */
        ldrd    r4, r5, [sp], #8
        pop     {r0, pc}        /* This is the only return point of memcpy.  */

dst_not_word_aligned:

        /* Get here when dst is not aligned and ip has the last 2 bits of dst,
           i.e., ip is the offset of dst from a word boundary.
           The number of bytes that remain to copy is r2 + 4,
           i.e., there are at least 4 bytes to copy.
           Write a partial word (1 to 3 bytes), such that dst becomes
           word-aligned.  */

        /* If dst is at ip bytes offset from a word (with 0 < ip < 4),
           then there are (4 - ip) bytes to fill up to align dst to the next
           word.  */
        rsb     ip, ip, #4              /* ip = #4 - ip.  */
        cmp     ip, #2

        /* Copy byte by byte with conditionals.  */
        itt     gt
        ldrbgt  r3, [r1], #1
        strbgt  r3, [r0], #1

        itt     ge
        ldrbge  r4, [r1], #1
        strbge  r4, [r0], #1

        ldrb    lr, [r1], #1
        strb    lr, [r0], #1

        /* Update the count.
           ip holds the number of bytes we have just copied.  */
        subs    r2, r2, ip              /* r2 = r2 - ip.  */
        blt     copy_less_than_4        /* If r2 < ip.  */
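
        /* Worked example (illustrative): if dst ends in binary ...01, then
           ip = dst & 3 = 1 on entry and ip = 3 after the RSB above, so all
           three conditional byte copies run and dst becomes word-aligned.
           If src started with the same offset from a word boundary, it is
           now word-aligned too, and the check below branches back to
           word_aligned.  */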

        /* Get here if there are at least 4 bytes to copy.
           Check if src is aligned.  If beforehand src and dst were not word
           aligned but congruent (same offset), then now they are both
           word-aligned, and we can copy the rest efficiently (without
           shifting).  */
        ands    ip, r1, #3              /* ip = last 2 bits of src.  */
        beq     word_aligned            /* If r1 is word-aligned.  */

src_not_word_aligned:
        /* Get here when src is not word-aligned, but dst is word-aligned.
           The number of bytes that remain to copy is r2+4.  */

        /* Copy word by word using LDR, relying on hardware support for
           unaligned access in LDR and STR (__ARM_FEATURE_UNALIGNED; requires
           SCTLR.A to be clear at run time).  */
        subs    r2, r2, #60
        blt     8f

7:
        /* Copy 64 bytes in every loop iteration.  */
        .irp    offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
        ldr     r3, [r1, \offset]
        str     r3, [r0, \offset]
        .endr

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     7b

8:
        /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
           Check if there are more than 3 bytes to copy.  */
        adds    r2, r2, #60
        blt     copy_less_than_4

9:
        /* Get here if there are less than 64 but at least 4 bytes to copy,
           where the number of bytes to copy is r2+4.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, r2, #4
        bge     9b

        b       copy_less_than_4


        .syntax unified
        .global __aeabi_memcpy4
        .type   __aeabi_memcpy4, %function
__aeabi_memcpy4:
        /* Assumes that both of its arguments are 4-byte aligned.  */

        push    {r0, lr}
        strd    r4, r5, [sp, #-8]!

        /* Is there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4        /* If n < 4.  */

        bl      word_aligned

        .syntax unified
        .global __aeabi_memcpy8
        .type   __aeabi_memcpy8, %function
__aeabi_memcpy8:
        /* Assumes that both of its arguments are 8-byte aligned.  */

        push    {r0, lr}
        strd    r4, r5, [sp, #-8]!

        /* Is there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4        /* If n < 4.  */

        /* Is there at least 8 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_8        /* If n < 8.  */

        /* Is there at least 64 bytes to copy?  */
        subs    r2, r2, #56
        blt     copy_less_than_64       /* If n < 64.  */

        bl      two_word_aligned

#endif
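
/* For reference (illustrative comment only, nothing here is assembled):
   in C terms, the AEABI entry points implemented above correspond to

       void __aeabi_memcpy  (void *dest, const void *src, size_t n);
       void __aeabi_memcpy4 (void *dest, const void *src, size_t n);
       void __aeabi_memcpy8 (void *dest, const void *src, size_t n);

   where __aeabi_memcpy4 and __aeabi_memcpy8 may assume that dest and src
   are 4-byte and 8-byte aligned, respectively.  */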