/* ANSI C standard library function memcpy.

   Copyright (c) 2002-2008 Tensilica Inc.

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
   CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
   SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */

#include "xtensa-asm.h"

/* If the Xtensa Unaligned Load Exception option is not used, this
   code can run a few cycles faster by relying on the low address bits
   being ignored.  However, if the code is then run with an Xtensa ISS
   client that checks for unaligned accesses, it will produce a lot of
   warning messages.  Set this flag to disable the use of unaligned
   accesses and keep the ISS happy.  */

#if XCHAL_UNALIGNED_LOAD_EXCEPTION || 1
#define UNALIGNED_ADDRESSES_CHECKED 1
#endif


/* void *memcpy (void *dst, const void *src, size_t len)

   The algorithm is as follows:

   If the destination is unaligned, align it by conditionally
   copying 1- and/or 2-byte pieces.

   If the source is aligned, copy 16 bytes at a time with a loop, and
   then finish up with 8-, 4-, 2-, and 1-byte copies conditional on
   the length.

   Otherwise (if the source is unaligned), do the same, but use SRC to
   align the source data.

   This code tries to use fall-through branches for the common case of
   an aligned source and destination and a length that is a multiple
   of 4 (or 8).  */


/* Byte-by-byte copy.  */

        .text
        .begin  schedule
        .align  XCHAL_INST_FETCH_WIDTH
        .literal_position
__memcpy_aux:

        /* Skip bytes to get proper alignment for three-byte loop.  */
.skip XCHAL_INST_FETCH_WIDTH - 3

.Lbytecopy:
#if XCHAL_HAVE_LOOPS
        loopnez a4, 2f
#else
        beqz    a4, 2f
        add     a7, a3, a4      // a7 = end address for source
#endif
1:      l8ui    a6, a3, 0
        addi    a3, a3, 1
#ifdef PSRAM_FIX
        nop
        nop
        nop
#endif
        s8i     a6, a5, 0
        addi    a5, a5, 1
#ifdef PSRAM_FIX
        memw
#endif
#if !XCHAL_HAVE_LOOPS
        bltu    a3, a7, 1b
#endif
2:      leaf_return


/* Destination is unaligned.  */

        .align  4
.Ldst1mod2:     // dst is only byte aligned

        /* Do short copies byte-by-byte.  */
        bltui   a4, 7, .Lbytecopy

        /* Copy 1 byte.  */
        l8ui    a6, a3, 0
        addi    a3, a3, 1
        addi    a4, a4, -1
        s8i     a6, a5, 0
#ifdef PSRAM_FIX
        memw
#endif
        addi    a5, a5, 1

        /* Return to main algorithm if dst is now aligned.  */
        bbci.l  a5, 1, .Ldstaligned

.Ldst2mod4:     // dst has 16-bit alignment

        /* Do short copies byte-by-byte.  */
        bltui   a4, 6, .Lbytecopy

        /* Copy 2 bytes.  */
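        /* Only the destination alignment is known at this point; the
           source may still be odd, so byte loads are used rather than
           a single 16-bit load.  */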
        l8ui    a6, a3, 0
        l8ui    a7, a3, 1
        addi    a3, a3, 2
        addi    a4, a4, -2
        s8i     a6, a5, 0
        s8i     a7, a5, 1
#ifdef PSRAM_FIX
        memw
#endif
        addi    a5, a5, 2

        /* dst is now aligned; return to main algorithm.  */
        j       .Ldstaligned


        .align  4
        .global memcpy
        .type   memcpy, @function
memcpy:
        leaf_entry sp, 16
        /* a2 = dst, a3 = src, a4 = len */

        mov     a5, a2          // copy dst so that a2 is return value
        bbsi.l  a2, 0, .Ldst1mod2
        bbsi.l  a2, 1, .Ldst2mod4
.Ldstaligned:

        /* Get number of loop iterations with 16B per iteration.  */
        srli    a7, a4, 4

        /* Check if source is aligned.  */
        slli    a8, a3, 30
        bnez    a8, .Lsrcunaligned

        /* Destination and source are word-aligned; use word copy.  */
#if XCHAL_HAVE_LOOPS
        loopnez a7, 2f
#else
        beqz    a7, 2f
        slli    a8, a7, 4
        add     a8, a8, a3      // a8 = end of last 16B source chunk
#endif

#ifndef PSRAM_FIX

1:      l32i    a6, a3, 0       // Hypothesis (unverified): a memw after this load might also serve as a fix.
        l32i    a7, a3, 4
        s32i    a6, a5, 0
        l32i    a6, a3, 8
        s32i    a7, a5, 4
        l32i    a7, a3, 12
        s32i    a6, a5, 8
        addi    a3, a3, 16
        s32i    a7, a5, 12
        addi    a5, a5, 16

#else
1:      l32i    a6, a3, 0
        l32i    a7, a3, 4
        s32i    a6, a5, 0
        s32i    a7, a5, 4
        memw
        l32i    a6, a3, 8
        l32i    a7, a3, 12
        s32i    a6, a5, 8
        s32i    a7, a5, 12
        memw

        addi    a3, a3, 16
        addi    a5, a5, 16

#endif


#if !XCHAL_HAVE_LOOPS
        bltu    a3, a8, 1b
#endif

        /* Copy any leftover pieces smaller than 16B.  */
2:      bbci.l  a4, 3, 3f

        /* Copy 8 bytes.  */
        l32i    a6, a3, 0
        l32i    a7, a3, 4
        addi    a3, a3, 8
        s32i    a6, a5, 0
        s32i    a7, a5, 4
        addi    a5, a5, 8

3:      bbsi.l  a4, 2, 4f
        bbsi.l  a4, 1, 5f
        bbsi.l  a4, 0, 6f
#ifdef PSRAM_FIX
        memw
#endif
        leaf_return

        .align  4
        /* Copy 4 bytes.  */
4:      l32i    a6, a3, 0
        addi    a3, a3, 4
        s32i    a6, a5, 0
        addi    a5, a5, 4
        bbsi.l  a4, 1, 5f
        bbsi.l  a4, 0, 6f
#ifdef PSRAM_FIX
        memw
#endif
        leaf_return

        /* Copy 2 bytes.  */
5:      l16ui   a6, a3, 0
        addi    a3, a3, 2
        s16i    a6, a5, 0
        addi    a5, a5, 2
        bbsi.l  a4, 0, 6f
#ifdef PSRAM_FIX
        memw
#endif
        leaf_return

        /* Copy 1 byte.  */
6:      l8ui    a6, a3, 0
        s8i     a6, a5, 0

.Ldone:
#ifdef PSRAM_FIX
        memw
#endif
        leaf_return


/* Destination is aligned; source is unaligned.  */

        .align  4
.Lsrcunaligned:
        /* Avoid loading anything for zero-length copies.  */
        beqz    a4, .Ldone

        /* Copy 16 bytes per iteration for word-aligned dst and
           unaligned src.  */
        ssa8    a3              // set shift amount from byte offset
#if UNALIGNED_ADDRESSES_CHECKED
        srli    a11, a8, 30     // save unalignment offset for below
        sub     a3, a3, a11     // align a3
#endif
        l32i    a6, a3, 0       // load first word
#if XCHAL_HAVE_LOOPS
        loopnez a7, 2f
#else
        beqz    a7, 2f
        slli    a10, a7, 4
        add     a10, a10, a3    // a10 = end of last 16B source chunk
#endif
1:      l32i    a7, a3, 4
        l32i    a8, a3, 8
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        l32i    a9, a3, 12
        src_b   a7, a7, a8
        s32i    a7, a5, 4
        l32i    a6, a3, 16
        src_b   a8, a8, a9
        s32i    a8, a5, 8
        addi    a3, a3, 16
        src_b   a9, a9, a6
        s32i    a9, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        bltu    a3, a10, 1b
#endif

2:      bbci.l  a4, 3, 3f

        /* Copy 8 bytes.  */
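        /* a6 already holds the aligned source word at the current src
           pointer.  As in the loop above, each src_b funnel-shifts a
           pair of adjacent source words by the amount set with ssa8,
           forming the unaligned source bytes for an aligned 32-bit
           store.  */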
        l32i    a7, a3, 4
        l32i    a8, a3, 8
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        addi    a3, a3, 8
        src_b   a7, a7, a8
        s32i    a7, a5, 4
        addi    a5, a5, 8
        mov     a6, a8

3:      bbci.l  a4, 2, 4f

        /* Copy 4 bytes.  */
        l32i    a7, a3, 4
        addi    a3, a3, 4
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        addi    a5, a5, 4
        mov     a6, a7
4:
#if UNALIGNED_ADDRESSES_CHECKED
        add     a3, a3, a11     // readjust a3 with correct misalignment
#endif
        bbsi.l  a4, 1, 5f
        bbsi.l  a4, 0, 6f
        leaf_return

        /* Copy 2 bytes.  */
5:      l8ui    a6, a3, 0
        l8ui    a7, a3, 1
        addi    a3, a3, 2
        s8i     a6, a5, 0
        s8i     a7, a5, 1
        addi    a5, a5, 2
        bbsi.l  a4, 0, 6f
#ifdef PSRAM_FIX
        memw
#endif
        leaf_return

        /* Copy 1 byte.  */
6:      l8ui    a6, a3, 0
        s8i     a6, a5, 0
#ifdef PSRAM_FIX
        memw
#endif
        leaf_return

        .end    schedule

        .size   memcpy, . - memcpy
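
/* Reference model (documentation only, not assembled): a rough C sketch
   of the copy strategy implemented above.  The name memcpy_model is
   illustrative, and the sketch ignores the PSRAM_FIX barriers, the
   zero-overhead-loop details, and the word-copy vs. SRC source paths,
   which affect performance but not the result.

   #include <stddef.h>
   #include <stdint.h>

   void *memcpy_model (void *dst, const void *src, size_t len)
   {
     unsigned char *d = dst;
     const unsigned char *s = src;
     size_t n, i;

     if ((uintptr_t) d & 1)              // .Ldst1mod2
       {
         if (len < 7)
           goto bytecopy;
         *d++ = *s++;
         len--;
       }
     if ((uintptr_t) d & 2)              // .Ldst2mod4
       {
         if (len < 6)
           goto bytecopy;
         *d++ = *s++;
         *d++ = *s++;
         len -= 2;
       }

     // .Ldstaligned: 16 bytes per iteration, then 8/4/2/1-byte tails.
     for (n = len >> 4; n > 0; n--, d += 16, s += 16)
       for (i = 0; i < 16; i++)
         d[i] = s[i];
     if (len & 8) for (i = 0; i < 8; i++) *d++ = *s++;
     if (len & 4) for (i = 0; i < 4; i++) *d++ = *s++;
     if (len & 2) { *d++ = *s++; *d++ = *s++; }
     if (len & 1) *d = *s;
     return dst;

   bytecopy:                             // .Lbytecopy
     while (len-- > 0)
       *d++ = *s++;
     return dst;
   }
*/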