/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   This memcpy routine is optimised for Cortex-A15 cores and takes
   advantage of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses
    LDRD/STRD support unaligned word accesses

   If compiled with GCC, this file should be enclosed within the
   following pre-processing check:
   if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED)

 */
        .syntax unified
        /* This implementation requires ARM state.  */
        .arm

#ifdef __ARM_NEON__

        .fpu    neon
        .arch   armv7-a
# define FRAME_SIZE     4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

        .arch   armv6
        .fpu    vfpv2
# define FRAME_SIZE     32
# define USE_VFP

#else
        .arch   armv6
# define FRAME_SIZE     32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET       8       /* PC pipeline compensation.  */
#define INSN_SIZE       4

/* Call parameters.  */
#define dstin   r0
#define src     r1
#define count   r2

/* Locals.  */
#define tmp1    r3
#define dst     ip
#define tmp2    r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define A_l     r2      /* Call-clobbered.  */
#define A_h     r3      /* Call-clobbered.  */
#define B_l     r4
#define B_h     r5
#define C_l     r6
#define C_h     r7
#define D_l     r8
#define D_h     r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines  5
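
/* With prefetch_lines = 5 the VFP long-copy loop streams data five
   64-byte lines (320 bytes) ahead of the current store position: the
   "prefetch_lines * 64 - 32" offset in cpy_line_vfp and the loop
   entry/exit adjustments in .Lcpy_body_long are all derived from this
   value, so changing it requires retuning those as well.  */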

#ifdef USE_VFP
        .macro  cpy_line_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm

        .macro  cpy_tail_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
#endif

        .macro def_fn f p2align=0
        .text
        .p2align \p2align
        .global \f
        .type \f, %function
\f:
        .endm

def_fn memcpy p2align=6

        mov     dst, dstin      /* Preserve dstin, we need to return it.  */
        cmp     count, #64
        bge     .Lcpy_not_short
        /* Deal with small copies quickly by dropping straight into the
           exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
        and     tmp1, count, #0x38
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        vld1.8  {d0}, [src]!    /* 14 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 12 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 10 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 8 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 6 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 4 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 2 words to go.  */
        vst1.8  {d0}, [dst]!

        tst     count, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data.  May not be aligned.  */
        /* Cannot use VFP for unaligned data.  */
        and     tmp1, count, #0x3c
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
        /* Jump directly into the sequence below at the correct offset.  */
        add     pc, pc, tmp1, lsl #1

        ldr     tmp1, [src, #-60]       /* 15 words to go.  */
        str     tmp1, [dst, #-60]

        ldr     tmp1, [src, #-56]       /* 14 words to go.  */
        str     tmp1, [dst, #-56]
        ldr     tmp1, [src, #-52]
        str     tmp1, [dst, #-52]

        ldr     tmp1, [src, #-48]       /* 12 words to go.  */
        str     tmp1, [dst, #-48]
        ldr     tmp1, [src, #-44]
        str     tmp1, [dst, #-44]

        ldr     tmp1, [src, #-40]       /* 10 words to go.  */
        str     tmp1, [dst, #-40]
        ldr     tmp1, [src, #-36]
        str     tmp1, [dst, #-36]

        ldr     tmp1, [src, #-32]       /* 8 words to go.  */
        str     tmp1, [dst, #-32]
        ldr     tmp1, [src, #-28]
        str     tmp1, [dst, #-28]

        ldr     tmp1, [src, #-24]       /* 6 words to go.  */
        str     tmp1, [dst, #-24]
        ldr     tmp1, [src, #-20]
        str     tmp1, [dst, #-20]

        ldr     tmp1, [src, #-16]       /* 4 words to go.  */
        str     tmp1, [dst, #-16]
        ldr     tmp1, [src, #-12]
        str     tmp1, [dst, #-12]

        ldr     tmp1, [src, #-8]        /* 2 words to go.  */
        str     tmp1, [dst, #-8]
        ldr     tmp1, [src, #-4]
        str     tmp1, [dst, #-4]
#endif
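
        /* The LSLS below shifts count left by 31, so the carry flag
           receives bit 1 of count and Z is clear iff bit 0 is set; the
           conditional halfword (CS) and byte (NE) copies then move the
           last 0-3 bytes with a single flag-setting instruction.  */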
        lsls    count, count, #31
        ldrhcs  tmp1, [src], #2
        ldrbne  src, [src]      /* Src is dead, use as a scratch.  */
        strhcs  tmp1, [dst], #2
        strbne  src, [dst]
        bx      lr

.Lcpy_not_short:
        /* At least 64 bytes to copy, but don't know the alignment yet.  */
        str     tmp2, [sp, #-FRAME_SIZE]!
        and     tmp2, src, #7
        and     tmp1, dst, #7
        cmp     tmp1, tmp2
        bne     .Lcpy_notaligned

#ifdef USE_VFP
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop.  */
        vmov.f32        s0, s0
#endif

        /* SRC and DST have the same mutual 32-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring DST into full 64-bit alignment.  */
        lsls    tmp2, dst, #29
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src], #1
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst], #1

1:
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        blt     .Ltail63aligned

        cmp     tmp2, #512
        bge     .Lcpy_body_long

.Lcpy_body_medium:      /* Count in tmp2.  */
#ifdef USE_VFP
1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
        vldr    d1, [src, #8]
        vstr    d0, [dst, #0]
        vldr    d0, [src, #16]
        vstr    d1, [dst, #8]
        vldr    d1, [src, #24]
        vstr    d0, [dst, #16]
        vldr    d0, [src, #32]
        vstr    d1, [dst, #24]
        vldr    d1, [src, #40]
        vstr    d0, [dst, #32]
        vldr    d0, [src, #48]
        vstr    d1, [dst, #40]
        vldr    d1, [src, #56]
        vstr    d0, [dst, #48]
        add     src, src, #64
        vstr    d1, [dst, #56]
        add     dst, dst, #64
        bge     1b
        tst     tmp2, #0x3f
        beq     .Ldone

.Ltail63aligned:        /* Count in tmp2.  */
        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1

        vldr    d0, [src, #-56] /* 14 words to go.  */
        vstr    d0, [dst, #-56]
        vldr    d0, [src, #-48] /* 12 words to go.  */
        vstr    d0, [dst, #-48]
        vldr    d0, [src, #-40] /* 10 words to go.  */
        vstr    d0, [dst, #-40]
        vldr    d0, [src, #-32] /* 8 words to go.  */
        vstr    d0, [dst, #-32]
        vldr    d0, [src, #-24] /* 6 words to go.  */
        vstr    d0, [dst, #-24]
        vldr    d0, [src, #-16] /* 4 words to go.  */
        vstr    d0, [dst, #-16]
        vldr    d0, [src, #-8]  /* 2 words to go.  */
        vstr    d0, [dst, #-8]
#else
        sub     src, src, #8
        sub     dst, dst, #8
1:
        ldrd    A_l, A_h, [src, #8]
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #16]
        strd    A_l, A_h, [dst, #16]
        ldrd    A_l, A_h, [src, #24]
        strd    A_l, A_h, [dst, #24]
        ldrd    A_l, A_h, [src, #32]
        strd    A_l, A_h, [dst, #32]
        ldrd    A_l, A_h, [src, #40]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #48]
        strd    A_l, A_h, [dst, #48]
        ldrd    A_l, A_h, [src, #56]
        strd    A_l, A_h, [dst, #56]
        ldrd    A_l, A_h, [src, #64]!
        strd    A_l, A_h, [dst, #64]!
        subs    tmp2, tmp2, #64
        bge     1b
        tst     tmp2, #0x3f
        bne     1f
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
1:
        add     src, src, #8
        add     dst, dst, #8
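
        /* Like .Ltail63unaligned above, the tail code below enters an
           unrolled sequence through a computed branch: reading PC
           yields the address of the current instruction plus PC_OFFSET
           (8 bytes in ARM state), so the RSB/ADD PC pair skips exactly
           the INSN_SIZE-byte load/store pairs that are not needed.  */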
.Ltail63aligned:        /* Count in tmp2.  */
        /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
           we know that the src and dest are 32-bit aligned so we can use
           LDRD/STRD to improve efficiency.  */
        /* TMP2 is now negative, but we don't care about that.  The bottom
           six bits still tell us how many bytes are left to copy.  */

        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        ldrd    A_l, A_h, [src, #-56]   /* 14 words to go.  */
        strd    A_l, A_h, [dst, #-56]
        ldrd    A_l, A_h, [src, #-48]   /* 12 words to go.  */
        strd    A_l, A_h, [dst, #-48]
        ldrd    A_l, A_h, [src, #-40]   /* 10 words to go.  */
        strd    A_l, A_h, [dst, #-40]
        ldrd    A_l, A_h, [src, #-32]   /* 8 words to go.  */
        strd    A_l, A_h, [dst, #-32]
        ldrd    A_l, A_h, [src, #-24]   /* 6 words to go.  */
        strd    A_l, A_h, [dst, #-24]
        ldrd    A_l, A_h, [src, #-16]   /* 4 words to go.  */
        strd    A_l, A_h, [dst, #-16]
        ldrd    A_l, A_h, [src, #-8]    /* 2 words to go.  */
        strd    A_l, A_h, [dst, #-8]

#endif
        tst     tmp2, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
        lsls    tmp2, tmp2, #31 /* Count (tmp2) now dead.  */
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src]
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst]

.Ldone:
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr

.Lcpy_body_long:        /* Count in tmp2.  */

        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go.  */
#ifdef USE_VFP
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer.  */

        vldr    d3, [src, #0]
        vldr    d4, [src, #64]
        vldr    d5, [src, #128]
        vldr    d6, [src, #192]
        vldr    d7, [src, #256]

        vldr    d0, [src, #8]
        vldr    d1, [src, #16]
        vldr    d2, [src, #24]
        add     src, src, #32

        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
        blt     2f
1:
        cpy_line_vfp    d3, 0
        cpy_line_vfp    d4, 64
        cpy_line_vfp    d5, 128
        add     dst, dst, #3 * 64
        add     src, src, #3 * 64
        cpy_line_vfp    d6, 0
        cpy_line_vfp    d7, 64
        add     dst, dst, #2 * 64
        add     src, src, #2 * 64
        subs    tmp2, tmp2, #prefetch_lines * 64
        bge     1b

2:
        cpy_tail_vfp    d3, 0
        cpy_tail_vfp    d4, 64
        cpy_tail_vfp    d5, 128
        add     src, src, #3 * 64
        add     dst, dst, #3 * 64
        cpy_tail_vfp    d6, 0
        vstr    d7, [dst, #64]
        vldr    d7, [src, #64]
        vstr    d0, [dst, #64 + 8]
        vldr    d0, [src, #64 + 8]
        vstr    d1, [dst, #64 + 16]
        vldr    d1, [src, #64 + 16]
        vstr    d2, [dst, #64 + 24]
        vldr    d2, [src, #64 + 24]
        vstr    d7, [dst, #64 + 32]
        add     src, src, #96
        vstr    d0, [dst, #64 + 40]
        vstr    d1, [dst, #64 + 48]
        vstr    d2, [dst, #64 + 56]
        add     dst, dst, #128
        add     tmp2, tmp2, #prefetch_lines * 64
        b       .Lcpy_body_medium
#else
        /* Long copy.  Use an SMS style loop to maximize the I/O
           bandwidth of the core.  We don't have enough spare registers
           to synthesise prefetching, so use PLD operations.  */
        /* Pre-bias src and dst.  */
        sub     src, src, #8
        sub     dst, dst, #8
        pld     [src, #8]
        pld     [src, #72]
        subs    tmp2, tmp2, #64
        pld     [src, #136]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        pld     [src, #200]
        ldrd    D_l, D_h, [src, #32]!
        b       1f
        .p2align        6
2:
        pld     [src, #232]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldrd    D_l, D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldrd    D_l, D_h, [src, #32]
        bcs     2b
        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #40
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        tst     tmp2, #0x3f
        bne     .Ltail63aligned
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
#endif
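
/* "SMS style" above refers to a software-pipelined (modulo-scheduled)
   loop: each iteration stores the 32 bytes loaded by the previous
   iteration while issuing the next block of loads, so load latency is
   hidden behind the stores.  The unaligned copy below reuses the same
   schedule with word loads in place of LDRD.  */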

.Lcpy_notaligned:
        pld     [src]
        pld     [src, #64]
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment.  */
        /* Bring DST to 64-bit alignment.  */
        lsls    tmp2, dst, #29
        pld     [src, #(2 * 64)]
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrbne  tmp1, [src], #1
        ldrhcs  tmp2, [src], #2
        strbne  tmp1, [dst], #1
        strhcs  tmp2, [dst], #2
1:
        pld     [src, #(3 * 64)]
        subs    count, count, #64
        ldrmi   tmp2, [sp], #FRAME_SIZE
        bmi     .Ltail63unaligned
        pld     [src, #(4 * 64)]

#ifdef USE_NEON
        vld1.8  {d0-d3}, [src]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bmi     2f
1:
        pld     [src, #(4 * 64)]
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vld1.8  {d0-d3}, [src]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bpl     1b
2:
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        ands    count, count, #0x3f
#else
        /* Use an SMS style loop to maximize the I/O bandwidth.  */
        sub     src, src, #4
        sub     dst, dst, #8
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]!
        b       1f
        .p2align        6
2:
        pld     [src, #(5 * 64) - (32 - 4)]
        strd    A_l, A_h, [dst, #40]
        ldr     A_l, [src, #36]
        ldr     A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldr     B_l, [src, #44]
        ldr     B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldr     C_l, [src, #52]
        ldr     C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldr     D_l, [src, #60]
        ldr     D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]
        bcs     2b

        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #36
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        ands    count, tmp2, #0x3f
#endif
        ldr     tmp2, [sp], #FRAME_SIZE
        bne     .Ltail63unaligned
        bx      lr

        .size   memcpy, . - memcpy
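
/* Usage sketch (illustrative only, per the header comment): when built
   with GCC this file is expected to be wrapped in the architecture check
   quoted in the header, e.g. from an enclosing source file:

        #if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED)
        #include "memcpy-armv7a.S"      (file name is an assumption)
        #endif

   with a generic memcpy assembled for targets that fail the check.  */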