/* ANSI C standard library function memcpy.

   Copyright (c) 2002-2008 Tensilica Inc.

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
   CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */

#include "xtensa-asm.h"

/* If the Xtensa Unaligned Load Exception option is not used, this
   code can run a few cycles faster by relying on the low address bits
   being ignored.  However, if the code is then run with an Xtensa ISS
   client that checks for unaligned accesses, it will produce a lot of
   warning messages.  Set this flag to disable the use of unaligned
   accesses and keep the ISS happy.  */

/* #define UNALIGNED_ADDRESSES_CHECKED XCHAL_UNALIGNED_LOAD_EXCEPTION */
#define UNALIGNED_ADDRESSES_CHECKED 1


/* void *memcpy (void *dst, const void *src, size_t len)

   The algorithm is as follows:

   If the destination is unaligned, align it by conditionally
   copying 1- and/or 2-byte pieces.

   If the source is aligned, copy 16 bytes per iteration with a loop,
   and then finish up with 8-, 4-, 2-, and 1-byte copies conditional
   on the length.

   Else (if the source is unaligned), do the same, but use SRC to
   align the source data.

   This code tries to use fall-through branches for the common case of
   an aligned source and destination and a length that is a multiple
   of 4 (or 8).  */
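
/* As a rough orientation for readers less familiar with Xtensa assembly,
   the following is an illustrative C sketch of the strategy described
   above.  It is added commentary only (not part of the build), it does
   not model the SRC-based funnel shifting used for unaligned sources,
   and the name memcpy_sketch is purely hypothetical.

       #include <stddef.h>
       #include <stdint.h>

       void *memcpy_sketch (void *dst, const void *src, size_t len)
       {
         unsigned char *d = dst;
         const unsigned char *s = src;

         // Align the destination by copying 1- and/or 2-byte pieces.
         if (len >= 7 && ((uintptr_t) d & 1)) { *d++ = *s++; len -= 1; }
         if (len >= 6 && ((uintptr_t) d & 2)) { *d++ = *s++; *d++ = *s++; len -= 2; }

         if (((uintptr_t) s & 3) == 0 && ((uintptr_t) d & 3) == 0)
           {
             // Main loop: 16 bytes per iteration, one word at a time.
             for (; len >= 16; len -= 16, s += 16, d += 16)
               for (int i = 0; i < 16; i += 4)
                 *(uint32_t *) (d + i) = *(const uint32_t *) (s + i);
             // 8- and 4-byte tails, still one word at a time.
             for (; len >= 4; len -= 4, s += 4, d += 4)
               *(uint32_t *) d = *(const uint32_t *) s;
           }

         // Whatever remains (and the whole copy when the source stays
         // unaligned, in this simplified sketch) goes byte by byte.
         while (len--)
           *d++ = *s++;
         return dst;
       }
*/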

/* Byte by byte copy.  */

        .text
        .begin schedule
        .align XCHAL_INST_FETCH_WIDTH
        .literal_position
__memcpy_aux:

        /* Skip bytes to get proper alignment for three-byte loop.  */
        .skip XCHAL_INST_FETCH_WIDTH - 3

.Lbytecopy:
#if XCHAL_HAVE_LOOPS
        loopnez a4, 2f
#else
        beqz a4, 2f
        add a7, a3, a4          // a7 = end address for source
#endif
1:      l8ui a6, a3, 0
        addi a3, a3, 1
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        nop
        nop
        nop
#endif
        s8i a6, a5, 0
        addi a5, a5, 1
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
#if !XCHAL_HAVE_LOOPS
        bltu a3, a7, 1b
#endif
2:      leaf_return


/* Destination is unaligned.  */

        .align 4
.Ldst1mod2:                     // dst is only byte aligned

        /* Do short copies byte-by-byte.  */
        bltui a4, 7, .Lbytecopy

        /* Copy 1 byte.  */
        l8ui a6, a3, 0
        addi a3, a3, 1
        addi a4, a4, -1
        s8i a6, a5, 0
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
        addi a5, a5, 1

        /* Return to main algorithm if dst is now aligned.  */
        bbci.l a5, 1, .Ldstaligned

.Ldst2mod4:                     // dst has 16-bit alignment

        /* Do short copies byte-by-byte.  */
        bltui a4, 6, .Lbytecopy

        /* Copy 2 bytes.  */
        l8ui a6, a3, 0
        l8ui a7, a3, 1
        addi a3, a3, 2
        addi a4, a4, -2
        s8i a6, a5, 0
        s8i a7, a5, 1
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
        addi a5, a5, 2

        /* dst is now aligned; return to main algorithm.  */
        j .Ldstaligned


        .align 4
        .global memcpy
        .type memcpy, @function
memcpy:
        leaf_entry sp, 16
        /* a2 = dst, a3 = src, a4 = len */

        mov a5, a2              // copy dst so that a2 is return value
        bbsi.l a2, 0, .Ldst1mod2
        bbsi.l a2, 1, .Ldst2mod4
.Ldstaligned:

        /* Get number of loop iterations with 16B per iteration.  */
        srli a7, a4, 4

        /* Check if source is aligned.  */
        slli a8, a3, 30
        bnez a8, .Lsrcunaligned

        /* Destination and source are word-aligned, use word copy.  */
#if XCHAL_HAVE_LOOPS
        loopnez a7, 2f
#else
        beqz a7, 2f
        slli a8, a7, 4
        add a8, a8, a3          // a8 = end of last 16B source chunk
#endif

#if XTENSA_ESP32_PSRAM_CACHE_FIX

1:      l32i a6, a3, 0
        l32i a7, a3, 4
        s32i a6, a5, 0
        s32i a7, a5, 4
        memw
        l32i a6, a3, 8
        l32i a7, a3, 12
        s32i a6, a5, 8
        s32i a7, a5, 12
        memw

        addi a3, a3, 16
        addi a5, a5, 16

#else

1:      l32i a6, a3, 0
        l32i a7, a3, 4
        s32i a6, a5, 0
        l32i a6, a3, 8
        s32i a7, a5, 4
        l32i a7, a3, 12
        s32i a6, a5, 8
        addi a3, a3, 16
        s32i a7, a5, 12
        addi a5, a5, 16

#endif


#if !XCHAL_HAVE_LOOPS
        bltu a3, a8, 1b
#endif

        /* Copy any leftover pieces smaller than 16B.  */
2:      bbci.l a4, 3, 3f

        /* Copy 8 bytes.  */
        l32i a6, a3, 0
        l32i a7, a3, 4
        addi a3, a3, 8
        s32i a6, a5, 0
        s32i a7, a5, 4
        addi a5, a5, 8

3:      bbsi.l a4, 2, 4f
        bbsi.l a4, 1, 5f
        bbsi.l a4, 0, 6f
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
        leaf_return

        .align 4
        /* Copy 4 bytes.  */
4:      l32i a6, a3, 0
        addi a3, a3, 4
        s32i a6, a5, 0
        addi a5, a5, 4
        bbsi.l a4, 1, 5f
        bbsi.l a4, 0, 6f
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
        leaf_return

        /* Copy 2 bytes.  */
5:      l16ui a6, a3, 0
        addi a3, a3, 2
        s16i a6, a5, 0
        addi a5, a5, 2
        bbsi.l a4, 0, 6f
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
        leaf_return

        /* Copy 1 byte.  */
6:      l8ui a6, a3, 0
        s8i a6, a5, 0

.Ldone:
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
        leaf_return


/* Destination is aligned; source is unaligned.  */
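/* Added orientation note (not from the original source): the loop below
   keeps the previously loaded source word in a register and uses the
   ssa8/src_b macros from xtensa-asm.h, which select the endian-appropriate
   SSA8L/SSA8B and SRC forms, to set a shift amount from the low source
   address bits and funnel-shift each pair of adjacent source words into
   one aligned word for the store.  Only one new word is loaded per word
   stored.  */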

        .align 4
.Lsrcunaligned:
        /* Avoid loading anything for zero-length copies.  */
        beqz a4, .Ldone

        /* Copy 16 bytes per iteration for word-aligned dst and
           unaligned src.  */
        ssa8 a3                 // set shift amount from byte offset
#if UNALIGNED_ADDRESSES_CHECKED
        srli a11, a8, 30        // save unalignment offset for below
        sub a3, a3, a11         // align a3
#endif
        l32i a6, a3, 0          // load first word
#if XCHAL_HAVE_LOOPS
        loopnez a7, 2f
#else
        beqz a7, 2f
        slli a10, a7, 4
        add a10, a10, a3        // a10 = end of last 16B source chunk
#endif
1:      l32i a7, a3, 4
        l32i a8, a3, 8
        src_b a6, a6, a7
        s32i a6, a5, 0
        l32i a9, a3, 12
        src_b a7, a7, a8
        s32i a7, a5, 4
        l32i a6, a3, 16
        src_b a8, a8, a9
        s32i a8, a5, 8
        addi a3, a3, 16
        src_b a9, a9, a6
        s32i a9, a5, 12
        addi a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        bltu a3, a10, 1b
#endif

2:      bbci.l a4, 3, 3f

        /* Copy 8 bytes.  */
        l32i a7, a3, 4
        l32i a8, a3, 8
        src_b a6, a6, a7
        s32i a6, a5, 0
        addi a3, a3, 8
        src_b a7, a7, a8
        s32i a7, a5, 4
        addi a5, a5, 8
        mov a6, a8

3:      bbci.l a4, 2, 4f

        /* Copy 4 bytes.  */
        l32i a7, a3, 4
        addi a3, a3, 4
        src_b a6, a6, a7
        s32i a6, a5, 0
        addi a5, a5, 4
        mov a6, a7
4:
#if UNALIGNED_ADDRESSES_CHECKED
        add a3, a3, a11         // readjust a3 with correct misalignment
#endif
        bbsi.l a4, 1, 5f
        bbsi.l a4, 0, 6f
        leaf_return

        /* Copy 2 bytes.  */
5:      l8ui a6, a3, 0
        l8ui a7, a3, 1
        addi a3, a3, 2
        s8i a6, a5, 0
        s8i a7, a5, 1
        addi a5, a5, 2
        bbsi.l a4, 0, 6f
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
        leaf_return

        /* Copy 1 byte.  */
6:      l8ui a6, a3, 0
        s8i a6, a5, 0
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
        leaf_return

        .end schedule

        .size memcpy, . - memcpy