/*
 * Copyright (c) 2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <picolibc.h>

#include "arm_asm.h"

        .syntax unified
        .global memcpy
        .type   memcpy, %function
        ASM_ALIAS __aeabi_memcpy, memcpy
memcpy:
        /* Assumes that n >= 0, and that dst and src are valid pointers.
           If there are at least 8 bytes to copy, use LDRD/STRD.
           If src and dst are misaligned with different offsets,
           first copy byte by byte until dst is aligned,
           and then copy using LDRD/STRD and shift if needed.
           When fewer than 8 bytes are left, copy a word and then byte by byte.  */

        /* Save registers (r0 holds the return value):
           optimized push {r0, r4, r5, lr}.
           To try and improve performance, the stack layout is changed,
           i.e., the stack does not look the way callers expect
           (highest numbered register at highest address).  */
        push    {r0, lr}
        strd    r4, r5, [sp, #-8]!

        /* Get copying of tiny blocks out of the way first.  */
        /* Are there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4        /* If n < 4.  */

        /* Check word alignment.  */
        ands    ip, r0, #3              /* ip = last 2 bits of dst.  */
        bne     dst_not_word_aligned    /* If dst is not word-aligned.  */

        /* Get here if dst is word-aligned.  */
        ands    ip, r1, #3              /* ip = last 2 bits of src.  */
        bne     src_not_word_aligned    /* If src is not word-aligned.  */
word_aligned:
        /* Get here if src and dst are both word-aligned.
           The number of bytes remaining to copy is r2+4.  */

        /* Are there at least 64 bytes to copy?  */
        subs    r2, r2, #60
        blt     copy_less_than_64       /* If r2 + 4 < 64.  */

        /* First, align the destination buffer to 8 bytes,
           to make sure double loads and stores don't cross a cache line boundary,
           as they are then more expensive even if the data is in the cache
           (they require two load/store issue cycles instead of one).
           If only one of the buffers is not 8-byte aligned,
           then it's more important to align dst than src,
           because stores that cross a cache line boundary carry a larger
           penalty than loads that do.
           This check and realignment are only worth doing
           if there is a lot to copy.  */
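
        /* For example (illustrative values): with dst = 0x1004, which is
           word-aligned but not 8-byte aligned, the single LDR/STR below
           advances dst to 0x1008, so every LDRD/STRD in the main loop uses
           an 8-byte aligned address and never straddles a cache line.  */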

        /* Get here if dst is word-aligned,
           i.e., the 2 least significant bits are 0.
           If dst is not two-word (8-byte) aligned (i.e., bit 2 of dst is set),
           then copy 1 word (4 bytes).  */
        ands    r3, r0, #4
        beq     two_word_aligned        /* If dst is already two-word aligned.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, r2, #4
        blt     copy_less_than_64

two_word_aligned:
        /* TODO: Align to cache line (useful for PLD optimization).  */

        /* Every loop iteration copies 64 bytes.  */
1:
        .irp    offset, #0, #8, #16, #24, #32, #40, #48, #56
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     1b                      /* If there is more to copy.  */

copy_less_than_64:

        /* Get here if there are fewer than 64 bytes to copy, -64 <= r2 < 0.
           Restore the count if there are more than 7 bytes to copy.  */
        adds    r2, r2, #56
        blt     copy_less_than_8

        /* Copy 8 bytes at a time.  */
2:
        ldrd    r4, r5, [r1], #8
        strd    r4, r5, [r0], #8
        subs    r2, r2, #8
        bge     2b                      /* If there is more to copy.  */

copy_less_than_8:

        /* Get here if there are fewer than 8 bytes to copy, -8 <= r2 < 0.
           Check if there is more to copy.  */
        cmn     r2, #8
        beq     return                  /* If r2 + 8 == 0.  */

        /* Restore the count if there are more than 3 bytes to copy.  */
        adds    r2, r2, #4
        blt     copy_less_than_4

        /* Copy 4 bytes.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4

copy_less_than_4:
        /* Get here if there are fewer than 4 bytes to copy, -4 <= r2 < 0.  */

        /* Restore the count, check if there is more to copy.  */
        adds    r2, r2, #4
        beq     return                  /* If r2 == 0.  */

        /* Get here with r2 in {1,2,3} = {01,10,11}.  */
        /* Logical shift left r2, insert 0s, update flags.  */
        lsls    r2, r2, #31

        /* Copy byte by byte.
           Condition ne means bit 0 of r2 is set, i.e., r2 is 1 or 3.
           Condition cs means bit 1 of r2 is set, i.e., r2 is 2 or 3.  */
        itt     ne
        ldrbne  r3, [r1], #1
        strbne  r3, [r0], #1

        itttt   cs
        ldrbcs  r4, [r1], #1
        ldrbcs  r5, [r1]
        strbcs  r4, [r0], #1
        strbcs  r5, [r0]

return:
        /* Restore registers: optimized pop {r0, r4, r5, pc}.  */
        ldrd    r4, r5, [sp], #8
        pop     {r0, pc}                /* This is the only return point of memcpy.  */

dst_not_word_aligned:

        /* Get here when dst is not word-aligned and ip has the last 2 bits of dst,
           i.e., ip is the offset of dst from a word boundary.
           The number of bytes that remain to copy is r2 + 4,
           i.e., there are at least 4 bytes to copy.
           Write a partial word (1 to 3 bytes), such that dst becomes
           word-aligned.  */

        /* If dst is at ip bytes offset from a word (with 0 < ip < 4),
           then there are (4 - ip) bytes to fill up to align dst to the next
           word.  */
        rsb     ip, ip, #4              /* ip = #4 - ip.  */
        cmp     ip, #2

        /* Copy byte by byte with conditionals.  */
        itt     gt
        ldrbgt  r3, [r1], #1
        strbgt  r3, [r0], #1

        itt     ge
        ldrbge  r4, [r1], #1
        strbge  r4, [r0], #1

        ldrb    lr, [r1], #1
        strb    lr, [r0], #1

        /* Update the count.
           ip holds the number of bytes we have just copied.  */
        subs    r2, r2, ip              /* r2 = r2 - ip.  */
        blt     copy_less_than_4        /* If r2 < ip.  */

        /* Get here if there are at least 4 bytes to copy.
           Check if src is aligned.  If beforehand src and dst were not word-
           aligned but congruent (same offset), then now they are both
           word-aligned, and we can copy the rest efficiently (without
           shifting).  */
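
        /* For example, if src and dst both started at offset 1 from a word
           boundary, the 3 bytes copied above leave both word-aligned, and
           the test below branches back to the fast word_aligned path.  */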
        ands    ip, r1, #3              /* ip = last 2 bits of src.  */
        beq     word_aligned            /* If r1 is word-aligned.  */

src_not_word_aligned:
        /* Get here when src is not word-aligned, but dst is word-aligned.
           The number of bytes that remain to copy is r2+4.  */

        /* Copy word by word using LDR when the unaligned access can be done
           in hardware, i.e., alignment checking is disabled (SCTLR.A clear),
           so LDR and STR support unaligned addresses.  */
        subs    r2, r2, #60
        blt     8f

7:
        /* Copy 64 bytes in every loop iteration.  */
        .irp    offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
        ldr     r3, [r1, \offset]
        str     r3, [r0, \offset]
        .endr

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     7b

8:
        /* Get here if there are fewer than 64 bytes to copy, -64 <= r2 < 0.
           Check if there are more than 3 bytes to copy.  */
        adds    r2, r2, #60
        blt     copy_less_than_4

9:
        /* Get here if there are fewer than 64 but at least 4 bytes to copy,
           where the number of bytes to copy is r2+4.  */
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, r2, #4
        bge     9b

        b       copy_less_than_4


        .syntax unified
        .global __aeabi_memcpy4
        .type   __aeabi_memcpy4, %function
__aeabi_memcpy4:
        /* Assumes that both of its arguments are 4-byte aligned.  */

        push    {r0, lr}
        strd    r4, r5, [sp, #-8]!

        /* Are there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4        /* If n < 4.  */

        bl      word_aligned

        .syntax unified
        .global __aeabi_memcpy8
        .type   __aeabi_memcpy8, %function
__aeabi_memcpy8:
        /* Assumes that both of its arguments are 8-byte aligned.  */

        push    {r0, lr}
        strd    r4, r5, [sp, #-8]!

        /* Are there at least 4 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_4        /* If n < 4.  */

        /* Are there at least 8 bytes to copy?  */
        subs    r2, r2, #4
        blt     copy_less_than_8        /* If n < 8.  */

        /* Are there at least 64 bytes to copy?  */
        subs    r2, r2, #56
        blt     copy_less_than_64       /* If n < 64.  */

        bl      two_word_aligned
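
        /* Note: __aeabi_memcpy4 and __aeabi_memcpy8 enter the shared memcpy
           body with BL, which clobbers lr.  That is harmless here because the
           caller's return address was already pushed in the prologue above,
           and the common epilogue at `return' restores it with pop {r0, pc}.  */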