/* ANSI C standard library function memcpy.

   Copyright (c) 2002-2008 Tensilica Inc.

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
   CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */

#include <picolibc.h>

#include "xtensa-asm.h"

/* If the Xtensa Unaligned Load Exception option is not used, this
   code can run a few cycles faster by relying on the low address bits
   being ignored.  However, if the code is then run with an Xtensa ISS
   client that checks for unaligned accesses, it will produce a lot of
   warning messages.  Set this flag to disable the use of unaligned
   accesses and keep the ISS happy.  */

/* #define UNALIGNED_ADDRESSES_CHECKED XCHAL_UNALIGNED_LOAD_EXCEPTION */
#define UNALIGNED_ADDRESSES_CHECKED 1


/* void *memcpy (void *dst, const void *src, size_t len)

   The algorithm is as follows:

   If the destination is unaligned, align it by conditionally
   copying 1- and/or 2-byte pieces.

   If the source is aligned, copy 16 bytes with a loop, and then finish up
   with 8-, 4-, 2-, and 1-byte copies conditional on the length.

   Else (if the source is unaligned), do the same, but use SRC to align the
   source data.

   This code tries to use fall-through branches for the common case of an
   aligned source and destination and a length that is a multiple of 4
   (or 8).  */
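
/* As a rough orientation aid, the overall strategy can be sketched in C.
   This is a hedged, illustrative sketch only: it is not the code that gets
   built, "memcpy_sketch" is a made-up name, and it leaves out the byte-copy
   fallback for very short misaligned copies, the unaligned-source shifting,
   and the XTENSA_ESP32_PSRAM_CACHE_FIX barriers.

     #include <stddef.h>
     #include <stdint.h>

     void *memcpy_sketch (void *dst, const void *src, size_t len)
     {
       unsigned char *d = dst;
       const unsigned char *s = src;

       // Align the destination with 1- and/or 2-byte copies
       // (.Ldst1mod2 / .Ldst2mod4 below).
       if (len >= 7 && ((uintptr_t) d & 1)) { *d++ = *s++; len--; }
       if (len >= 6 && ((uintptr_t) d & 2))
         { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; len -= 2; }

       // Main loop: 16 bytes per iteration (32-bit loads/stores in the asm).
       for (; len >= 16; len -= 16, d += 16, s += 16)
         for (int i = 0; i < 16; i++)
           d[i] = s[i];

       // Tail: 8-, 4-, 2-, and 1-byte pieces, selected by the length bits.
       if (len & 8) { for (int i = 0; i < 8; i++) d[i] = s[i]; d += 8; s += 8; }
       if (len & 4) { for (int i = 0; i < 4; i++) d[i] = s[i]; d += 4; s += 4; }
       if (len & 2) { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; }
       if (len & 1) *d = *s;

       return dst;
     }
*/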

/* Byte by byte copy.  */

        .text
        .begin schedule
        .align  XCHAL_INST_FETCH_WIDTH
        .literal_position
__memcpy_aux:

        /* Skip bytes to get proper alignment for the three-byte loop.  */
.skip XCHAL_INST_FETCH_WIDTH - 3

.Lbytecopy:
#if XCHAL_HAVE_LOOPS
        loopnez a4, 2f
#else
        beqz    a4, 2f
        add     a7, a3, a4      // a7 = end address for source
#endif
1:      l8ui    a6, a3, 0
        addi    a3, a3, 1
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        nop
        nop
        nop
#endif
        s8i     a6, a5, 0
        addi    a5, a5, 1
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
#if !XCHAL_HAVE_LOOPS
        bltu    a3, a7, 1b
#endif
2:      leaf_return


/* Destination is unaligned.  */

        .align  4
.Ldst1mod2:     // dst is only byte aligned

        /* Do short copies byte-by-byte.  */
        bltui   a4, 7, .Lbytecopy

        /* Copy 1 byte.  */
        l8ui    a6, a3, 0
        addi    a3, a3, 1
        addi    a4, a4, -1
        s8i     a6, a5, 0
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
        addi    a5, a5, 1

        /* Return to main algorithm if dst is now aligned.  */
        bbci.l  a5, 1, .Ldstaligned

.Ldst2mod4:     // dst has 16-bit alignment

        /* Do short copies byte-by-byte.  */
        bltui   a4, 6, .Lbytecopy

        /* Copy 2 bytes.  */
        l8ui    a6, a3, 0
        l8ui    a7, a3, 1
        addi    a3, a3, 2
        addi    a4, a4, -2
        s8i     a6, a5, 0
        s8i     a7, a5, 1
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
        addi    a5, a5, 2

        /* dst is now aligned; return to main algorithm.  */
        j       .Ldstaligned


        .align  4
        .global memcpy
        .type   memcpy, @function
memcpy:
        leaf_entry sp, 16
        /* a2 = dst, a3 = src, a4 = len */

        mov     a5, a2          // copy dst so that a2 is the return value
        bbsi.l  a2, 0, .Ldst1mod2
        bbsi.l  a2, 1, .Ldst2mod4
.Ldstaligned:

        /* Get number of loop iterations with 16B per iteration.  */
        srli    a7, a4, 4

        /* Check if source is aligned.  */
        slli    a8, a3, 30
        bnez    a8, .Lsrcunaligned

        /* Destination and source are word-aligned, use word copy.  */
#if XCHAL_HAVE_LOOPS
        loopnez a7, 2f
#else
        beqz    a7, 2f
        slli    a8, a7, 4
        add     a8, a8, a3      // a8 = end of last 16B source chunk
#endif

#if XTENSA_ESP32_PSRAM_CACHE_FIX

1:      l32i    a6, a3, 0
        l32i    a7, a3, 4
        s32i    a6, a5, 0
        s32i    a7, a5, 4
        memw
        l32i    a6, a3, 8
        l32i    a7, a3, 12
        s32i    a6, a5, 8
        s32i    a7, a5, 12
        memw

        addi    a3, a3, 16
        addi    a5, a5, 16

#else

1:      l32i    a6, a3, 0
        l32i    a7, a3, 4
        s32i    a6, a5, 0
        l32i    a6, a3, 8
        s32i    a7, a5, 4
        l32i    a7, a3, 12
        s32i    a6, a5, 8
        addi    a3, a3, 16
        s32i    a7, a5, 12
        addi    a5, a5, 16

#endif


#if !XCHAL_HAVE_LOOPS
        bltu    a3, a8, 1b
#endif

        /* Copy any leftover pieces smaller than 16B.  */
2:      bbci.l  a4, 3, 3f

        /* Copy 8 bytes.  */
        l32i    a6, a3, 0
        l32i    a7, a3, 4
        addi    a3, a3, 8
        s32i    a6, a5, 0
        s32i    a7, a5, 4
        addi    a5, a5, 8

3:      bbsi.l  a4, 2, 4f
        bbsi.l  a4, 1, 5f
        bbsi.l  a4, 0, 6f
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
        leaf_return

        .align 4
        /* Copy 4 bytes.  */
4:      l32i    a6, a3, 0
        addi    a3, a3, 4
        s32i    a6, a5, 0
        addi    a5, a5, 4
        bbsi.l  a4, 1, 5f
        bbsi.l  a4, 0, 6f
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
        leaf_return

        /* Copy 2 bytes.  */
5:      l16ui   a6, a3, 0
        addi    a3, a3, 2
        s16i    a6, a5, 0
        addi    a5, a5, 2
        bbsi.l  a4, 0, 6f
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
        leaf_return

        /* Copy 1 byte.  */
6:      l8ui    a6, a3, 0
        s8i     a6, a5, 0

.Ldone:
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
        leaf_return


/* Destination is aligned; source is unaligned.  */
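
/* A hedged C-level analogy of the shift-and-merge trick used here, assuming
   a little-endian configuration (illustrative only; the names are made up
   and this is not the built code).  SSA8 derives a byte-offset shift amount
   from the low bits of the source address, and each SRC instruction funnels
   two adjacent aligned source words into one aligned destination word; the
   ssa8/src_b spellings used below come from xtensa-asm.h and select the
   endian-appropriate forms.

     #include <stddef.h>
     #include <stdint.h>

     // Copy n words to an aligned dst from a source that sits 'offset'
     // bytes (1, 2, or 3) past the word-aligned address asrc.
     static void copy_shifted (uint32_t *dst, const uint32_t *asrc,
                               unsigned offset, size_t n)
     {
       unsigned shift = 8 * offset;            // what SSA8 puts in SAR
       uint32_t w0 = asrc[0];                  // "load first word"
       for (size_t i = 0; i < n; i++)
         {
           uint32_t w1 = asrc[i + 1];          // next aligned word
           dst[i] = (w0 >> shift) | (w1 << (32 - shift));  // one SRC + S32I
           w0 = w1;                            // reuse the last word loaded
         }
     }

   The loop below unrolls this to four words (16 bytes) per iteration and,
   like the sketch, reads one aligned word past the last full chunk.  In the
   tail, when UNALIGNED_ADDRESSES_CHECKED is set, the saved misalignment is
   added back to a3 so the final 2- and 1-byte copies read from the correct
   byte addresses.  */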

        .align  4
.Lsrcunaligned:
        /* Avoid loading anything for zero-length copies.  */
        beqz    a4, .Ldone

        /* Copy 16 bytes per iteration for word-aligned dst and
           unaligned src.  */
        ssa8    a3              // set shift amount from byte offset
#if UNALIGNED_ADDRESSES_CHECKED
        srli    a11, a8, 30     // save unalignment offset for below
        sub     a3, a3, a11     // align a3
#endif
        l32i    a6, a3, 0       // load first word
#if XCHAL_HAVE_LOOPS
        loopnez a7, 2f
#else
        beqz    a7, 2f
        slli    a10, a7, 4
        add     a10, a10, a3    // a10 = end of last 16B source chunk
#endif
1:      l32i    a7, a3, 4
        l32i    a8, a3, 8
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        l32i    a9, a3, 12
        src_b   a7, a7, a8
        s32i    a7, a5, 4
        l32i    a6, a3, 16
        src_b   a8, a8, a9
        s32i    a8, a5, 8
        addi    a3, a3, 16
        src_b   a9, a9, a6
        s32i    a9, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        bltu    a3, a10, 1b
#endif

2:      bbci.l  a4, 3, 3f

        /* Copy 8 bytes.  */
        l32i    a7, a3, 4
        l32i    a8, a3, 8
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        addi    a3, a3, 8
        src_b   a7, a7, a8
        s32i    a7, a5, 4
        addi    a5, a5, 8
        mov     a6, a8

3:      bbci.l  a4, 2, 4f

        /* Copy 4 bytes.  */
        l32i    a7, a3, 4
        addi    a3, a3, 4
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        addi    a5, a5, 4
        mov     a6, a7
4:
#if UNALIGNED_ADDRESSES_CHECKED
        add     a3, a3, a11     // readjust a3 with correct misalignment
#endif
        bbsi.l  a4, 1, 5f
        bbsi.l  a4, 0, 6f
        leaf_return

        /* Copy 2 bytes.  */
5:      l8ui    a6, a3, 0
        l8ui    a7, a3, 1
        addi    a3, a3, 2
        s8i     a6, a5, 0
        s8i     a7, a5, 1
        addi    a5, a5, 2
        bbsi.l  a4, 0, 6f
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
        leaf_return

        /* Copy 1 byte.  */
6:      l8ui    a6, a3, 0
        s8i     a6, a5, 0
#if XTENSA_ESP32_PSRAM_CACHE_FIX
        memw
#endif
        leaf_return

        .end schedule

        .size   memcpy, . - memcpy