/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses
    LDRD/STRD support unaligned word accesses

   If compiled with GCC, this file should be enclosed within the following
   pre-processing check:
   #if defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED)

 */
#include <picolibc.h>

#include "arm_asm.h"

        .syntax unified
        /* This implementation requires ARM state.  */
        .arm

#ifdef __ARM_NEON__

        .fpu    neon
        .arch   armv7-a
# define FRAME_SIZE     4
# define USE_VFP
# define USE_NEON

#elif __ARM_FP != 0

        .arch   armv6
        .fpu    vfpv2
# define FRAME_SIZE     32
# define USE_VFP

#else
        .arch   armv6
# define FRAME_SIZE     32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET       8       /* PC pipeline compensation.  */
#define INSN_SIZE       4

/* Call parameters.  */
#define dstin   r0
#define src     r1
#define count   r2

/* Locals.  */
#define tmp1    r3
#define dst     ip
#define tmp2    r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define A_l     r2              /* Call-clobbered.  */
#define A_h     r3              /* Call-clobbered.  */
#define B_l     r4
#define B_h     r5
#define C_l     r6
#define C_h     r7
#define D_l     r8
#define D_h     r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */
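/* A sketch of the arithmetic, derived from cpy_line_vfp below: with
   prefetch_lines = 5 and 64-byte copy lines, each loop iteration reloads
   its line register from

        src + base + prefetch_lines * 64 - 32  =  src + base + 288

   i.e. about five 64-byte lines ahead of the data currently being
   stored, so every load also serves as the prefetch for a later
   iteration.  */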
#define prefetch_lines  5

#ifdef USE_VFP
        .macro  cpy_line_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm

        .macro  cpy_tail_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
#endif

        .macro def_fn f p2align=0
        .text
        .p2align \p2align
        .global \f
        .type \f, %function
\f:
        .endm

def_fn  memcpy p2align=6
        ASM_ALIAS __aeabi_memcpy, memcpy
        ASM_ALIAS __aeabi_memcpy4, memcpy
        ASM_ALIAS __aeabi_memcpy8, memcpy

        mov     dst, dstin      /* Preserve dstin, we need to return it.  */
        cmp     count, #64
        bge     .Lcpy_not_short
        /* Deal with small copies quickly by dropping straight into the
           exit block.  */
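/* The "add pc, pc, tmp1" sequences below implement a computed jump into
   an unrolled copy sequence (a Duff's-device-style dispatch).  Reading
   PC yields the address of the current instruction plus PC_OFFSET (8 in
   ARM state), so with tmp1 = (56 - PC_OFFSET + INSN_SIZE) - (count & 0x38)
   the jump skips one load/store pair for every 8 bytes not to be copied.
   Worked example for the NEON variant: count & 0x38 == 56 lands on the
   first VLD1/VST1 pair (14 words to copy), while count & 0x38 == 0 skips
   all seven pairs (56 bytes of instructions) and falls through to the
   word-sized tail.  */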
.Ltail63unaligned:
#ifdef USE_NEON
        and     tmp1, count, #0x38
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        vld1.8  {d0}, [src]!    /* 14 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 12 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 10 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 8 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 6 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 4 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 2 words to go.  */
        vst1.8  {d0}, [dst]!

        tst     count, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data.  May not be aligned.  */
        /* Cannot use VFP for unaligned data.  */
        and     tmp1, count, #0x3c
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
        /* Jump directly into the sequence below at the correct offset.  */
        add     pc, pc, tmp1, lsl #1

        ldr     tmp1, [src, #-60]       /* 15 words to go.  */
        str     tmp1, [dst, #-60]

        ldr     tmp1, [src, #-56]       /* 14 words to go.  */
        str     tmp1, [dst, #-56]
        ldr     tmp1, [src, #-52]
        str     tmp1, [dst, #-52]

        ldr     tmp1, [src, #-48]       /* 12 words to go.  */
        str     tmp1, [dst, #-48]
        ldr     tmp1, [src, #-44]
        str     tmp1, [dst, #-44]

        ldr     tmp1, [src, #-40]       /* 10 words to go.  */
        str     tmp1, [dst, #-40]
        ldr     tmp1, [src, #-36]
        str     tmp1, [dst, #-36]

        ldr     tmp1, [src, #-32]       /* 8 words to go.  */
        str     tmp1, [dst, #-32]
        ldr     tmp1, [src, #-28]
        str     tmp1, [dst, #-28]

        ldr     tmp1, [src, #-24]       /* 6 words to go.  */
        str     tmp1, [dst, #-24]
        ldr     tmp1, [src, #-20]
        str     tmp1, [dst, #-20]

        ldr     tmp1, [src, #-16]       /* 4 words to go.  */
        str     tmp1, [dst, #-16]
        ldr     tmp1, [src, #-12]
        str     tmp1, [dst, #-12]

        ldr     tmp1, [src, #-8]        /* 2 words to go.  */
        str     tmp1, [dst, #-8]
        ldr     tmp1, [src, #-4]
        str     tmp1, [dst, #-4]
#endif

        /* Copy the final 0-3 bytes: the LSL #31 shifts bit 1 of count
           into C and leaves bit 0 as the only bit in the result, so CS
           selects the halfword copy and NE the byte copy.  */
        lsls    count, count, #31
        ldrhcs  tmp1, [src], #2
        ldrbne  src, [src]      /* Src is dead, use as a scratch.  */
        strhcs  tmp1, [dst], #2
        strbne  src, [dst]
        bx      lr

.Lcpy_not_short:
        /* At least 64 bytes to copy, but don't know the alignment yet.  */
        str     tmp2, [sp, #-FRAME_SIZE]!
        and     tmp2, src, #7
        and     tmp1, dst, #7
        cmp     tmp1, tmp2
        bne     .Lcpy_notaligned

#ifdef USE_VFP
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop.  */
        vmov.f32        s0, s0
#endif

        /* SRC and DST have the same mutual 32-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring DST into full 64-bit alignment.  */
        lsls    tmp2, dst, #29
        beq     1f
        rsbs    tmp2, tmp2, #0
        /* tmp2 == (8 - (dst & 7)) << 29: the pre-copy byte count, scaled
           so that N (from the RSBS) selects the word move and, after two
           more shifts, C selects the halfword and NE the byte move.  */
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src], #1
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst], #1

1:
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        blt     .Ltail63aligned

        cmp     tmp2, #512
        bge     .Lcpy_body_long
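/* Medium-sized aligned copies arrive here with the remaining count in
   tmp2 (tmp2 == count - 64).  The VFP loop below ping-pongs between d0
   and d1 so that each VSTR stores data loaded two instructions earlier,
   keeping one load and one store in flight throughout the loop.  */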
.Lcpy_body_medium:                      /* Count in tmp2.  */
#ifdef USE_VFP
1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
        vldr    d1, [src, #8]
        vstr    d0, [dst, #0]
        vldr    d0, [src, #16]
        vstr    d1, [dst, #8]
        vldr    d1, [src, #24]
        vstr    d0, [dst, #16]
        vldr    d0, [src, #32]
        vstr    d1, [dst, #24]
        vldr    d1, [src, #40]
        vstr    d0, [dst, #32]
        vldr    d0, [src, #48]
        vstr    d1, [dst, #40]
        vldr    d1, [src, #56]
        vstr    d0, [dst, #48]
        add     src, src, #64
        vstr    d1, [dst, #56]
        add     dst, dst, #64
        bge     1b
        tst     tmp2, #0x3f
        beq     .Ldone

.Ltail63aligned:                        /* Count in tmp2.  */
        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1

        vldr    d0, [src, #-56] /* 14 words to go.  */
        vstr    d0, [dst, #-56]
        vldr    d0, [src, #-48] /* 12 words to go.  */
        vstr    d0, [dst, #-48]
        vldr    d0, [src, #-40] /* 10 words to go.  */
        vstr    d0, [dst, #-40]
        vldr    d0, [src, #-32] /* 8 words to go.  */
        vstr    d0, [dst, #-32]
        vldr    d0, [src, #-24] /* 6 words to go.  */
        vstr    d0, [dst, #-24]
        vldr    d0, [src, #-16] /* 4 words to go.  */
        vstr    d0, [dst, #-16]
        vldr    d0, [src, #-8]  /* 2 words to go.  */
        vstr    d0, [dst, #-8]
#else
        sub     src, src, #8
        sub     dst, dst, #8
1:
        ldrd    A_l, A_h, [src, #8]
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #16]
        strd    A_l, A_h, [dst, #16]
        ldrd    A_l, A_h, [src, #24]
        strd    A_l, A_h, [dst, #24]
        ldrd    A_l, A_h, [src, #32]
        strd    A_l, A_h, [dst, #32]
        ldrd    A_l, A_h, [src, #40]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #48]
        strd    A_l, A_h, [dst, #48]
        ldrd    A_l, A_h, [src, #56]
        strd    A_l, A_h, [dst, #56]
        ldrd    A_l, A_h, [src, #64]!
        strd    A_l, A_h, [dst, #64]!
        subs    tmp2, tmp2, #64
        bge     1b
        tst     tmp2, #0x3f
        bne     1f
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
1:
        add     src, src, #8
        add     dst, dst, #8

.Ltail63aligned:                        /* Count in tmp2.  */
        /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
           we know that the src and dest are 32-bit aligned so we can use
           LDRD/STRD to improve efficiency.  */
        /* TMP2 is now negative, but we don't care about that.  The bottom
           six bits still tell us how many bytes are left to copy.  */

        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        ldrd    A_l, A_h, [src, #-56]   /* 14 words to go.  */
        strd    A_l, A_h, [dst, #-56]
        ldrd    A_l, A_h, [src, #-48]   /* 12 words to go.  */
        strd    A_l, A_h, [dst, #-48]
        ldrd    A_l, A_h, [src, #-40]   /* 10 words to go.  */
        strd    A_l, A_h, [dst, #-40]
        ldrd    A_l, A_h, [src, #-32]   /* 8 words to go.  */
        strd    A_l, A_h, [dst, #-32]
        ldrd    A_l, A_h, [src, #-24]   /* 6 words to go.  */
        strd    A_l, A_h, [dst, #-24]
        ldrd    A_l, A_h, [src, #-16]   /* 4 words to go.  */
        strd    A_l, A_h, [dst, #-16]
        ldrd    A_l, A_h, [src, #-8]    /* 2 words to go.  */
        strd    A_l, A_h, [dst, #-8]

#endif
        tst     tmp2, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
        lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead.  */
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src]
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst]

.Ldone:
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
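/* Both long-copy variants below are software-pipelined ("SMS" refers to
   software modulo scheduling): each loop body stores the data loaded on
   the previous iteration while loading the data for the next one, with a
   prologue before the inner loop to fill the pipeline and an epilogue
   after it to drain it.  The VFP variant additionally keeps d3-d7 filled
   from 0, 64, 128, 192 and 256 bytes ahead, so the prefetched data is
   never fetched twice.  */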
.Lcpy_body_long:                        /* Count in tmp2.  */

        /* Long copy.  We know that there are at least
           (prefetch_lines * 64) bytes to go.  */
#ifdef USE_VFP
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer.  */

        vldr    d3, [src, #0]
        vldr    d4, [src, #64]
        vldr    d5, [src, #128]
        vldr    d6, [src, #192]
        vldr    d7, [src, #256]

        vldr    d0, [src, #8]
        vldr    d1, [src, #16]
        vldr    d2, [src, #24]
        add     src, src, #32

        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
        blt     2f
1:
        cpy_line_vfp    d3, 0
        cpy_line_vfp    d4, 64
        cpy_line_vfp    d5, 128
        add     dst, dst, #3 * 64
        add     src, src, #3 * 64
        cpy_line_vfp    d6, 0
        cpy_line_vfp    d7, 64
        add     dst, dst, #2 * 64
        add     src, src, #2 * 64
        subs    tmp2, tmp2, #prefetch_lines * 64
        bge     1b

2:
        cpy_tail_vfp    d3, 0
        cpy_tail_vfp    d4, 64
        cpy_tail_vfp    d5, 128
        add     src, src, #3 * 64
        add     dst, dst, #3 * 64
        cpy_tail_vfp    d6, 0
        vstr    d7, [dst, #64]
        vldr    d7, [src, #64]
        vstr    d0, [dst, #64 + 8]
        vldr    d0, [src, #64 + 8]
        vstr    d1, [dst, #64 + 16]
        vldr    d1, [src, #64 + 16]
        vstr    d2, [dst, #64 + 24]
        vldr    d2, [src, #64 + 24]
        vstr    d7, [dst, #64 + 32]
        add     src, src, #96
        vstr    d0, [dst, #64 + 40]
        vstr    d1, [dst, #64 + 48]
        vstr    d2, [dst, #64 + 56]
        add     dst, dst, #128
        add     tmp2, tmp2, #prefetch_lines * 64
        b       .Lcpy_body_medium
#else
        /* Long copy.  Use an SMS style loop to maximize the I/O
           bandwidth of the core.  We don't have enough spare registers
           to synthesise prefetching, so use PLD operations.  */
        /* Pre-bias src and dst.  */
        sub     src, src, #8
        sub     dst, dst, #8
        pld     [src, #8]
        pld     [src, #72]
        subs    tmp2, tmp2, #64
        pld     [src, #136]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        pld     [src, #200]
        ldrd    D_l, D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #232]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldrd    D_l, D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldrd    D_l, D_h, [src, #32]
        bcs     2b
        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #40
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        tst     tmp2, #0x3f
        bne     .Ltail63aligned
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
#endif
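/* No mutual alignment.  DST is still brought up to 64-bit alignment, so
   all stores are aligned: the NEON variant loads with VLD1.8 (which
   accepts any source alignment) and stores with the 64-bit alignment
   hint supplied by the ALIGN macro, while the GP variant loads with
   plain, unaligned-capable LDR and stores with STRD to the aligned
   destination.  */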
.Lcpy_notaligned:
        pld     [src]
        pld     [src, #64]
        /* There are at least 64 bytes to copy, but there is no mutual
           alignment.  */
        /* Bring DST to 64-bit alignment.  */
        lsls    tmp2, dst, #29
        pld     [src, #(2 * 64)]
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrbne  tmp1, [src], #1
        ldrhcs  tmp2, [src], #2
        strbne  tmp1, [dst], #1
        strhcs  tmp2, [dst], #2
1:
        pld     [src, #(3 * 64)]
        subs    count, count, #64
        ldrmi   tmp2, [sp], #FRAME_SIZE
        bmi     .Ltail63unaligned
        pld     [src, #(4 * 64)]

#ifdef USE_NEON
        vld1.8  {d0-d3}, [src]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bmi     2f
1:
        pld     [src, #(4 * 64)]
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vld1.8  {d0-d3}, [src]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bpl     1b
2:
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        ands    count, count, #0x3f
#else
        /* Use an SMS style loop to maximize the I/O bandwidth.  */
        sub     src, src, #4
        sub     dst, dst, #8
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #(5 * 64) - (32 - 4)]
        strd    A_l, A_h, [dst, #40]
        ldr     A_l, [src, #36]
        ldr     A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldr     B_l, [src, #44]
        ldr     B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldr     C_l, [src, #52]
        ldr     C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldr     D_l, [src, #60]
        ldr     D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]
        bcs     2b

        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #36
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        ands    count, tmp2, #0x3f
#endif
        ldr     tmp2, [sp], #FRAME_SIZE
        bne     .Ltail63unaligned
        bx      lr

        .size   memcpy, . - memcpy
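/* A note on the aliases at the top of the function: the AEABI names
   __aeabi_memcpy4 and __aeabi_memcpy8 are permitted to assume 4- and
   8-byte-aligned operands, so pointing them at the general-purpose entry
   is always correct (the alignment guarantee is simply unused).  The
   calling convention is the C one:

        void *memcpy (void *dstin, const void *src, size_t count)
           r0 = dstin (also the return value), r1 = src, r2 = count

   which is why the working destination pointer lives in ip ("dst") and
   r0 is returned unmodified.  */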