1/* 2 * Copyright (c) 2012-2015 3 * MIPS Technologies, Inc., California. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its 14 * contributors may be used to endorse or promote products derived from 15 * this software without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30#ifdef ANDROID_CHANGES 31# include "machine/asm.h" 32# include "machine/regdef.h" 33# define USE_MEMMOVE_FOR_OVERLAP 34# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED 35# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE 36#elif _LIBC 37# include "machine/asm.h" 38# include "machine/regdef.h" 39# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED 40# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE 41#else 42# include <regdef.h> 43# include <sys/asm.h> 44#endif 45 46/* Check to see if the MIPS architecture we are compiling for supports 47 * prefetching. 48 */ 49 50#if (__mips == 4) || (__mips == 5) || (__mips == 32) || (__mips == 64) 51# ifndef DISABLE_PREFETCH 52# define USE_PREFETCH 53# endif 54#endif 55 56#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32)) 57# ifndef DISABLE_DOUBLE 58# define USE_DOUBLE 59# endif 60#endif 61 62 63#if __mips_isa_rev > 5 64# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) 65# undef PREFETCH_STORE_HINT 66# define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED 67# endif 68# define R6_CODE 69#endif 70 71/* Some asm.h files do not have the L macro definition. */ 72#ifndef L 73# if _MIPS_SIM == _ABIO32 74# define L(label) $L ## label 75# else 76# define L(label) .L ## label 77# endif 78#endif 79 80/* Some asm.h files do not have the PTR_ADDIU macro definition. */ 81#ifndef PTR_ADDIU 82# ifdef USE_DOUBLE 83# define PTR_ADDIU daddiu 84# else 85# define PTR_ADDIU addiu 86# endif 87#endif 88 89/* Some asm.h files do not have the PTR_SRA macro definition. */ 90#ifndef PTR_SRA 91# ifdef USE_DOUBLE 92# define PTR_SRA dsra 93# else 94# define PTR_SRA sra 95# endif 96#endif 97 98/* New R6 instructions that may not be in asm.h. */ 99#ifndef PTR_LSA 100# if _MIPS_SIM == _ABI64 101# define PTR_LSA dlsa 102# else 103# define PTR_LSA lsa 104# endif 105#endif 106 107/* 108 * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load 109 * prefetches appears to offer a slight preformance advantage. 110 * 111 * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE 112 * or PREFETCH_STORE_STREAMED offers a large performance advantage 113 * but PREPAREFORSTORE has some special restrictions to consider. 114 * 115 * Prefetch with the 'prepare for store' hint does not copy a memory 116 * location into the cache, it just allocates a cache line and zeros 117 * it out. This means that if you do not write to the entire cache 118 * line before writing it out to memory some data will get zero'ed out 119 * when the cache line is written back to memory and data will be lost. 120 * 121 * Also if you are using this memcpy to copy overlapping buffers it may 122 * not behave correctly when using the 'prepare for store' hint. If you 123 * use the 'prepare for store' prefetch on a memory area that is in the 124 * memcpy source (as well as the memcpy destination), then you will get 125 * some data zero'ed out before you have a chance to read it and data will 126 * be lost. 127 * 128 * If you are going to use this memcpy routine with the 'prepare for store' 129 * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid 130 * the problem of running memcpy on overlapping buffers. 131 * 132 * There are ifdef'ed sections of this memcpy to make sure that it does not 133 * do prefetches on cache lines that are not going to be completely written. 134 * This code is only needed and only used when PREFETCH_STORE_HINT is set to 135 * PREFETCH_HINT_PREPAREFORSTORE. This code assumes that cache lines are 136 * 32 bytes and if the cache line is larger it will not work correctly. 137 */ 138 139#ifdef USE_PREFETCH 140# define PREFETCH_HINT_LOAD 0 141# define PREFETCH_HINT_STORE 1 142# define PREFETCH_HINT_LOAD_STREAMED 4 143# define PREFETCH_HINT_STORE_STREAMED 5 144# define PREFETCH_HINT_LOAD_RETAINED 6 145# define PREFETCH_HINT_STORE_RETAINED 7 146# define PREFETCH_HINT_WRITEBACK_INVAL 25 147# define PREFETCH_HINT_PREPAREFORSTORE 30 148 149/* 150 * If we have not picked out what hints to use at this point use the 151 * standard load and store prefetch hints. 152 */ 153# ifndef PREFETCH_STORE_HINT 154# define PREFETCH_STORE_HINT PREFETCH_HINT_STORE 155# endif 156# ifndef PREFETCH_LOAD_HINT 157# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD 158# endif 159 160/* 161 * We double everything when USE_DOUBLE is true so we do 2 prefetches to 162 * get 64 bytes in that case. The assumption is that each individual 163 * prefetch brings in 32 bytes. 164 */ 165 166# ifdef USE_DOUBLE 167# define PREFETCH_CHUNK 64 168# define PREFETCH_FOR_LOAD(chunk, reg) \ 169 pref PREFETCH_LOAD_HINT, (chunk)*64(reg); \ 170 pref PREFETCH_LOAD_HINT, ((chunk)*64)+32(reg) 171# define PREFETCH_FOR_STORE(chunk, reg) \ 172 pref PREFETCH_STORE_HINT, (chunk)*64(reg); \ 173 pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg) 174# else 175# define PREFETCH_CHUNK 32 176# define PREFETCH_FOR_LOAD(chunk, reg) \ 177 pref PREFETCH_LOAD_HINT, (chunk)*32(reg) 178# define PREFETCH_FOR_STORE(chunk, reg) \ 179 pref PREFETCH_STORE_HINT, (chunk)*32(reg) 180# endif 181/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less 182 * than PREFETCH_CHUNK, the assumed size of each prefetch. If the real size 183 * of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE 184 * hint is used, the code will not work correctly. If PREPAREFORSTORE is not 185 * used then MAX_PREFETCH_SIZE does not matter. */ 186# define MAX_PREFETCH_SIZE 128 187/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater 188 * than 5 on a STORE prefetch and that a single prefetch can never be larger 189 * than MAX_PREFETCH_SIZE. We add the extra 32 when USE_DOUBLE is set because 190 * we actually do two prefetches in that case, one 32 bytes after the other. */ 191# ifdef USE_DOUBLE 192# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE 193# else 194# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE 195# endif 196# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \ 197 && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE) 198/* We cannot handle this because the initial prefetches may fetch bytes that 199 * are before the buffer being copied. We start copies with an offset 200 * of 4 so avoid this situation when using PREPAREFORSTORE. */ 201#error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small." 202# endif 203#else /* USE_PREFETCH not defined */ 204# define PREFETCH_FOR_LOAD(offset, reg) 205# define PREFETCH_FOR_STORE(offset, reg) 206#endif 207 208/* Allow the routine to be named something else if desired. */ 209#ifndef MEMCPY_NAME 210# define MEMCPY_NAME memcpy 211#endif 212 213/* We use these 32/64 bit registers as temporaries to do the copying. */ 214#define REG0 t0 215#define REG1 t1 216#define REG2 t2 217#define REG3 t3 218#if defined(_MIPS_SIM) && (_MIPS_SIM == _ABIO32 || _MIPS_SIM == _ABIO64) 219# define REG4 t4 220# define REG5 t5 221# define REG6 t6 222# define REG7 t7 223#else 224# define REG4 ta0 225# define REG5 ta1 226# define REG6 ta2 227# define REG7 ta3 228#endif 229 230/* We load/store 64 bits at a time when USE_DOUBLE is true. 231 * The C_ prefix stands for CHUNK and is used to avoid macro name 232 * conflicts with system header files. */ 233 234#ifdef USE_DOUBLE 235# define C_ST sd 236# define C_LD ld 237# if __MIPSEB 238# define C_LDHI ldl /* high part is left in big-endian */ 239# define C_STHI sdl /* high part is left in big-endian */ 240# define C_LDLO ldr /* low part is right in big-endian */ 241# define C_STLO sdr /* low part is right in big-endian */ 242# else 243# define C_LDHI ldr /* high part is right in little-endian */ 244# define C_STHI sdr /* high part is right in little-endian */ 245# define C_LDLO ldl /* low part is left in little-endian */ 246# define C_STLO sdl /* low part is left in little-endian */ 247# endif 248# define C_ALIGN dalign /* r6 align instruction */ 249#else 250# define C_ST sw 251# define C_LD lw 252# if __MIPSEB 253# define C_LDHI lwl /* high part is left in big-endian */ 254# define C_STHI swl /* high part is left in big-endian */ 255# define C_LDLO lwr /* low part is right in big-endian */ 256# define C_STLO swr /* low part is right in big-endian */ 257# else 258# define C_LDHI lwr /* high part is right in little-endian */ 259# define C_STHI swr /* high part is right in little-endian */ 260# define C_LDLO lwl /* low part is left in little-endian */ 261# define C_STLO swl /* low part is left in little-endian */ 262# endif 263# define C_ALIGN align /* r6 align instruction */ 264#endif 265 266/* Bookkeeping values for 32 vs. 64 bit mode. */ 267#ifdef USE_DOUBLE 268# define NSIZE 8 269# define NSIZEMASK 0x3f 270# define NSIZEDMASK 0x7f 271#else 272# define NSIZE 4 273# define NSIZEMASK 0x1f 274# define NSIZEDMASK 0x3f 275#endif 276#define UNIT(unit) ((unit)*NSIZE) 277#define UNITM1(unit) (((unit)*NSIZE)-1) 278 279#ifdef ANDROID_CHANGES 280LEAF(MEMCPY_NAME, 0) 281#else 282LEAF(MEMCPY_NAME) 283#endif 284 .set nomips16 285 .set noreorder 286/* 287 * Below we handle the case where memcpy is called with overlapping src and dst. 288 * Although memcpy is not required to handle this case, some parts of Android 289 * like Skia rely on such usage. We call memmove to handle such cases. 290 */ 291#ifdef USE_MEMMOVE_FOR_OVERLAP 292 PTR_SUBU t0,a0,a1 293 PTR_SRA t2,t0,31 294 xor t1,t0,t2 295 PTR_SUBU t0,t1,t2 296 sltu t2,t0,a2 297 beq t2,zero,L(memcpy) 298 la t9,memmove 299 jr t9 300 nop 301L(memcpy): 302#endif 303/* 304 * If the size is less than 2*NSIZE (8 or 16), go to L(lastb). Regardless of 305 * size, copy dst pointer to v0 for the return value. 306 */ 307 slti t2,a2,(2 * NSIZE) 308 bne t2,zero,L(lasts) 309#if defined(RETURN_FIRST_PREFETCH) || defined(RETURN_LAST_PREFETCH) 310 move v0,zero 311#else 312 move v0,a0 313#endif 314 315#ifndef R6_CODE 316 317/* 318 * If src and dst have different alignments, go to L(unaligned), if they 319 * have the same alignment (but are not actually aligned) do a partial 320 * load/store to make them aligned. If they are both already aligned 321 * we can start copying at L(aligned). 322 */ 323 xor t8,a1,a0 324 andi t8,t8,(NSIZE-1) /* t8 is a0/a1 word-displacement */ 325 bne t8,zero,L(unaligned) 326 PTR_SUBU a3, zero, a0 327 328 andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */ 329 beq a3,zero,L(aligned) /* if a3=0, it is already aligned */ 330 PTR_SUBU a2,a2,a3 /* a2 is the remining bytes count */ 331 332 C_LDHI t8,0(a1) 333 PTR_ADDU a1,a1,a3 334 C_STHI t8,0(a0) 335 PTR_ADDU a0,a0,a3 336 337#else /* R6_CODE */ 338 339/* 340 * Align the destination and hope that the source gets aligned too. If it 341 * doesn't we jump to L(r6_unaligned*) to do unaligned copies using the r6 342 * align instruction. 343 */ 344 andi t8,a0,7 345 lapc t9,L(atable) 346 PTR_LSA t9,t8,t9,2 347 jrc t9 348L(atable): 349 bc L(lb0) 350 bc L(lb7) 351 bc L(lb6) 352 bc L(lb5) 353 bc L(lb4) 354 bc L(lb3) 355 bc L(lb2) 356 bc L(lb1) 357L(lb7): 358 lb a3, 6(a1) 359 sb a3, 6(a0) 360L(lb6): 361 lb a3, 5(a1) 362 sb a3, 5(a0) 363L(lb5): 364 lb a3, 4(a1) 365 sb a3, 4(a0) 366L(lb4): 367 lb a3, 3(a1) 368 sb a3, 3(a0) 369L(lb3): 370 lb a3, 2(a1) 371 sb a3, 2(a0) 372L(lb2): 373 lb a3, 1(a1) 374 sb a3, 1(a0) 375L(lb1): 376 lb a3, 0(a1) 377 sb a3, 0(a0) 378 379 li t9,8 380 subu t8,t9,t8 381 PTR_SUBU a2,a2,t8 382 PTR_ADDU a0,a0,t8 383 PTR_ADDU a1,a1,t8 384L(lb0): 385 386 andi t8,a1,(NSIZE-1) 387 lapc t9,L(jtable) 388 PTR_LSA t9,t8,t9,2 389 jrc t9 390L(jtable): 391 bc L(aligned) 392 bc L(r6_unaligned1) 393 bc L(r6_unaligned2) 394 bc L(r6_unaligned3) 395# ifdef USE_DOUBLE 396 bc L(r6_unaligned4) 397 bc L(r6_unaligned5) 398 bc L(r6_unaligned6) 399 bc L(r6_unaligned7) 400# endif 401#endif /* R6_CODE */ 402 403L(aligned): 404 405/* 406 * Now dst/src are both aligned to (word or double word) aligned addresses 407 * Set a2 to count how many bytes we have to copy after all the 64/128 byte 408 * chunks are copied and a3 to the dst pointer after all the 64/128 byte 409 * chunks have been copied. We will loop, incrementing a0 and a1 until a0 410 * equals a3. 411 */ 412 413 andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */ 414 beq a2,t8,L(chkw) /* if a2==t8, no 64-byte/128-byte chunks */ 415 PTR_SUBU a3,a2,t8 /* subtract from a2 the reminder */ 416 PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */ 417 418/* When in the loop we may prefetch with the 'prepare to store' hint, 419 * in this case the a0+x should not be past the "t0-32" address. This 420 * means: for x=128 the last "safe" a0 address is "t0-160". Alternatively, 421 * for x=64 the last "safe" a0 address is "t0-96" In the current version we 422 * will use "prefetch hint,128(a0)", so "t0-160" is the limit. 423 */ 424#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) 425 PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */ 426 PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */ 427#endif 428 PREFETCH_FOR_LOAD (0, a1) 429 PREFETCH_FOR_LOAD (1, a1) 430 PREFETCH_FOR_LOAD (2, a1) 431 PREFETCH_FOR_LOAD (3, a1) 432#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE) 433 PREFETCH_FOR_STORE (1, a0) 434 PREFETCH_FOR_STORE (2, a0) 435 PREFETCH_FOR_STORE (3, a0) 436#endif 437#if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH) 438# if PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE 439 sltu v1,t9,a0 440 bgtz v1,L(skip_set) 441 nop 442 PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4) 443L(skip_set): 444# else 445 PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1) 446# endif 447#endif 448#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) \ 449 && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE) 450 PTR_ADDIU v0,a0,(PREFETCH_CHUNK*3) 451# ifdef USE_DOUBLE 452 PTR_ADDIU v0,v0,32 453# endif 454#endif 455L(loop16w): 456 C_LD t0,UNIT(0)(a1) 457#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) 458 sltu v1,t9,a0 /* If a0 > t9 don't use next prefetch */ 459 bgtz v1,L(skip_pref) 460#endif 461 C_LD t1,UNIT(1)(a1) 462#ifndef R6_CODE 463 PREFETCH_FOR_STORE (4, a0) 464 PREFETCH_FOR_STORE (5, a0) 465#else 466 PREFETCH_FOR_STORE (2, a0) 467#endif 468#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) 469 PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5) 470# ifdef USE_DOUBLE 471 PTR_ADDIU v0,v0,32 472# endif 473#endif 474L(skip_pref): 475 C_LD REG2,UNIT(2)(a1) 476 C_LD REG3,UNIT(3)(a1) 477 C_LD REG4,UNIT(4)(a1) 478 C_LD REG5,UNIT(5)(a1) 479 C_LD REG6,UNIT(6)(a1) 480 C_LD REG7,UNIT(7)(a1) 481#ifndef R6_CODE 482 PREFETCH_FOR_LOAD (4, a1) 483#else 484 PREFETCH_FOR_LOAD (3, a1) 485#endif 486 C_ST t0,UNIT(0)(a0) 487 C_ST t1,UNIT(1)(a0) 488 C_ST REG2,UNIT(2)(a0) 489 C_ST REG3,UNIT(3)(a0) 490 C_ST REG4,UNIT(4)(a0) 491 C_ST REG5,UNIT(5)(a0) 492 C_ST REG6,UNIT(6)(a0) 493 C_ST REG7,UNIT(7)(a0) 494 495 C_LD t0,UNIT(8)(a1) 496 C_LD t1,UNIT(9)(a1) 497 C_LD REG2,UNIT(10)(a1) 498 C_LD REG3,UNIT(11)(a1) 499 C_LD REG4,UNIT(12)(a1) 500 C_LD REG5,UNIT(13)(a1) 501 C_LD REG6,UNIT(14)(a1) 502 C_LD REG7,UNIT(15)(a1) 503#ifndef R6_CODE 504 PREFETCH_FOR_LOAD (5, a1) 505#endif 506 C_ST t0,UNIT(8)(a0) 507 C_ST t1,UNIT(9)(a0) 508 C_ST REG2,UNIT(10)(a0) 509 C_ST REG3,UNIT(11)(a0) 510 C_ST REG4,UNIT(12)(a0) 511 C_ST REG5,UNIT(13)(a0) 512 C_ST REG6,UNIT(14)(a0) 513 C_ST REG7,UNIT(15)(a0) 514 PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */ 515 bne a0,a3,L(loop16w) 516 PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */ 517 move a2,t8 518 519/* Here we have src and dest word-aligned but less than 64-bytes or 520 * 128 bytes to go. Check for a 32(64) byte chunk and copy if if there 521 * is one. Otherwise jump down to L(chk1w) to handle the tail end of 522 * the copy. 523 */ 524 525L(chkw): 526 PREFETCH_FOR_LOAD (0, a1) 527 andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */ 528 /* The t8 is the reminder count past 32-bytes */ 529 beq a2,t8,L(chk1w) /* When a2=t8, no 32-byte chunk */ 530 nop 531 C_LD t0,UNIT(0)(a1) 532 C_LD t1,UNIT(1)(a1) 533 C_LD REG2,UNIT(2)(a1) 534 C_LD REG3,UNIT(3)(a1) 535 C_LD REG4,UNIT(4)(a1) 536 C_LD REG5,UNIT(5)(a1) 537 C_LD REG6,UNIT(6)(a1) 538 C_LD REG7,UNIT(7)(a1) 539 PTR_ADDIU a1,a1,UNIT(8) 540 C_ST t0,UNIT(0)(a0) 541 C_ST t1,UNIT(1)(a0) 542 C_ST REG2,UNIT(2)(a0) 543 C_ST REG3,UNIT(3)(a0) 544 C_ST REG4,UNIT(4)(a0) 545 C_ST REG5,UNIT(5)(a0) 546 C_ST REG6,UNIT(6)(a0) 547 C_ST REG7,UNIT(7)(a0) 548 PTR_ADDIU a0,a0,UNIT(8) 549 550/* 551 * Here we have less than 32(64) bytes to copy. Set up for a loop to 552 * copy one word (or double word) at a time. Set a2 to count how many 553 * bytes we have to copy after all the word (or double word) chunks are 554 * copied and a3 to the dst pointer after all the (d)word chunks have 555 * been copied. We will loop, incrementing a0 and a1 until a0 equals a3. 556 */ 557L(chk1w): 558 andi a2,t8,(NSIZE-1) /* a2 is the reminder past one (d)word chunks */ 559 beq a2,t8,L(lastw) 560 PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */ 561 PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */ 562 563/* copying in words (4-byte or 8-byte chunks) */ 564L(wordCopy_loop): 565 C_LD REG3,UNIT(0)(a1) 566 PTR_ADDIU a0,a0,UNIT(1) 567 PTR_ADDIU a1,a1,UNIT(1) 568 bne a0,a3,L(wordCopy_loop) 569 C_ST REG3,UNIT(-1)(a0) 570 571/* If we have been copying double words, see if we can copy a single word 572 before doing byte copies. We can have, at most, one word to copy. */ 573 574L(lastw): 575#ifdef USE_DOUBLE 576 andi t8,a2,3 /* a2 is the remainder past 4 byte chunks. */ 577 beq t8,a2,L(lastb) 578 move a2,t8 579 lw REG3,0(a1) 580 sw REG3,0(a0) 581 PTR_ADDIU a0,a0,4 582 PTR_ADDIU a1,a1,4 583#endif 584 585/* Copy the last 8 (or 16) bytes */ 586L(lastb): 587 blez a2,L(leave) 588 PTR_ADDU a3,a0,a2 /* a3 is the last dst address */ 589L(lastbloop): 590 lb v1,0(a1) 591 PTR_ADDIU a0,a0,1 592 PTR_ADDIU a1,a1,1 593 bne a0,a3,L(lastbloop) 594 sb v1,-1(a0) 595L(leave): 596 j ra 597 nop 598 599/* We jump here with a memcpy of less than 8 or 16 bytes, depending on 600 whether or not USE_DOUBLE is defined. Instead of just doing byte 601 copies, check the alignment and size and use lw/sw if possible. 602 Otherwise, do byte copies. */ 603 604L(lasts): 605 andi t8,a2,3 606 beq t8,a2,L(lastb) 607 608 andi t9,a0,3 609 bne t9,zero,L(lastb) 610 andi t9,a1,3 611 bne t9,zero,L(lastb) 612 613 PTR_SUBU a3,a2,t8 614 PTR_ADDU a3,a0,a3 615 616L(wcopy_loop): 617 lw REG3,0(a1) 618 PTR_ADDIU a0,a0,4 619 PTR_ADDIU a1,a1,4 620 bne a0,a3,L(wcopy_loop) 621 sw REG3,-4(a0) 622 623 b L(lastb) 624 move a2,t8 625 626#ifndef R6_CODE 627/* 628 * UNALIGNED case, got here with a3 = "negu a0" 629 * This code is nearly identical to the aligned code above 630 * but only the destination (not the source) gets aligned 631 * so we need to do partial loads of the source followed 632 * by normal stores to the destination (once we have aligned 633 * the destination). 634 */ 635 636L(unaligned): 637 andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */ 638 beqz a3,L(ua_chk16w) /* if a3=0, it is already aligned */ 639 PTR_SUBU a2,a2,a3 /* a2 is the remining bytes count */ 640 641 C_LDHI v1,UNIT(0)(a1) 642 C_LDLO v1,UNITM1(1)(a1) 643 PTR_ADDU a1,a1,a3 644 C_STHI v1,UNIT(0)(a0) 645 PTR_ADDU a0,a0,a3 646 647/* 648 * Now the destination (but not the source) is aligned 649 * Set a2 to count how many bytes we have to copy after all the 64/128 byte 650 * chunks are copied and a3 to the dst pointer after all the 64/128 byte 651 * chunks have been copied. We will loop, incrementing a0 and a1 until a0 652 * equals a3. 653 */ 654 655L(ua_chk16w): 656 andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */ 657 beq a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */ 658 PTR_SUBU a3,a2,t8 /* subtract from a2 the reminder */ 659 PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */ 660 661# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) 662 PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */ 663 PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */ 664# endif 665 PREFETCH_FOR_LOAD (0, a1) 666 PREFETCH_FOR_LOAD (1, a1) 667 PREFETCH_FOR_LOAD (2, a1) 668# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE) 669 PREFETCH_FOR_STORE (1, a0) 670 PREFETCH_FOR_STORE (2, a0) 671 PREFETCH_FOR_STORE (3, a0) 672# endif 673# if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH) 674# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) 675 sltu v1,t9,a0 676 bgtz v1,L(ua_skip_set) 677 nop 678 PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4) 679L(ua_skip_set): 680# else 681 PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1) 682# endif 683# endif 684L(ua_loop16w): 685 PREFETCH_FOR_LOAD (3, a1) 686 C_LDHI t0,UNIT(0)(a1) 687 C_LDHI t1,UNIT(1)(a1) 688 C_LDHI REG2,UNIT(2)(a1) 689# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) 690 sltu v1,t9,a0 691 bgtz v1,L(ua_skip_pref) 692# endif 693 C_LDHI REG3,UNIT(3)(a1) 694 PREFETCH_FOR_STORE (4, a0) 695 PREFETCH_FOR_STORE (5, a0) 696L(ua_skip_pref): 697 C_LDHI REG4,UNIT(4)(a1) 698 C_LDHI REG5,UNIT(5)(a1) 699 C_LDHI REG6,UNIT(6)(a1) 700 C_LDHI REG7,UNIT(7)(a1) 701 C_LDLO t0,UNITM1(1)(a1) 702 C_LDLO t1,UNITM1(2)(a1) 703 C_LDLO REG2,UNITM1(3)(a1) 704 C_LDLO REG3,UNITM1(4)(a1) 705 C_LDLO REG4,UNITM1(5)(a1) 706 C_LDLO REG5,UNITM1(6)(a1) 707 C_LDLO REG6,UNITM1(7)(a1) 708 C_LDLO REG7,UNITM1(8)(a1) 709 PREFETCH_FOR_LOAD (4, a1) 710 C_ST t0,UNIT(0)(a0) 711 C_ST t1,UNIT(1)(a0) 712 C_ST REG2,UNIT(2)(a0) 713 C_ST REG3,UNIT(3)(a0) 714 C_ST REG4,UNIT(4)(a0) 715 C_ST REG5,UNIT(5)(a0) 716 C_ST REG6,UNIT(6)(a0) 717 C_ST REG7,UNIT(7)(a0) 718 C_LDHI t0,UNIT(8)(a1) 719 C_LDHI t1,UNIT(9)(a1) 720 C_LDHI REG2,UNIT(10)(a1) 721 C_LDHI REG3,UNIT(11)(a1) 722 C_LDHI REG4,UNIT(12)(a1) 723 C_LDHI REG5,UNIT(13)(a1) 724 C_LDHI REG6,UNIT(14)(a1) 725 C_LDHI REG7,UNIT(15)(a1) 726 C_LDLO t0,UNITM1(9)(a1) 727 C_LDLO t1,UNITM1(10)(a1) 728 C_LDLO REG2,UNITM1(11)(a1) 729 C_LDLO REG3,UNITM1(12)(a1) 730 C_LDLO REG4,UNITM1(13)(a1) 731 C_LDLO REG5,UNITM1(14)(a1) 732 C_LDLO REG6,UNITM1(15)(a1) 733 C_LDLO REG7,UNITM1(16)(a1) 734 PREFETCH_FOR_LOAD (5, a1) 735 C_ST t0,UNIT(8)(a0) 736 C_ST t1,UNIT(9)(a0) 737 C_ST REG2,UNIT(10)(a0) 738 C_ST REG3,UNIT(11)(a0) 739 C_ST REG4,UNIT(12)(a0) 740 C_ST REG5,UNIT(13)(a0) 741 C_ST REG6,UNIT(14)(a0) 742 C_ST REG7,UNIT(15)(a0) 743 PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */ 744 bne a0,a3,L(ua_loop16w) 745 PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */ 746 move a2,t8 747 748/* Here we have src and dest word-aligned but less than 64-bytes or 749 * 128 bytes to go. Check for a 32(64) byte chunk and copy if if there 750 * is one. Otherwise jump down to L(ua_chk1w) to handle the tail end of 751 * the copy. */ 752 753L(ua_chkw): 754 PREFETCH_FOR_LOAD (0, a1) 755 andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */ 756 /* t8 is the reminder count past 32-bytes */ 757 beq a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */ 758 nop 759 C_LDHI t0,UNIT(0)(a1) 760 C_LDHI t1,UNIT(1)(a1) 761 C_LDHI REG2,UNIT(2)(a1) 762 C_LDHI REG3,UNIT(3)(a1) 763 C_LDHI REG4,UNIT(4)(a1) 764 C_LDHI REG5,UNIT(5)(a1) 765 C_LDHI REG6,UNIT(6)(a1) 766 C_LDHI REG7,UNIT(7)(a1) 767 C_LDLO t0,UNITM1(1)(a1) 768 C_LDLO t1,UNITM1(2)(a1) 769 C_LDLO REG2,UNITM1(3)(a1) 770 C_LDLO REG3,UNITM1(4)(a1) 771 C_LDLO REG4,UNITM1(5)(a1) 772 C_LDLO REG5,UNITM1(6)(a1) 773 C_LDLO REG6,UNITM1(7)(a1) 774 C_LDLO REG7,UNITM1(8)(a1) 775 PTR_ADDIU a1,a1,UNIT(8) 776 C_ST t0,UNIT(0)(a0) 777 C_ST t1,UNIT(1)(a0) 778 C_ST REG2,UNIT(2)(a0) 779 C_ST REG3,UNIT(3)(a0) 780 C_ST REG4,UNIT(4)(a0) 781 C_ST REG5,UNIT(5)(a0) 782 C_ST REG6,UNIT(6)(a0) 783 C_ST REG7,UNIT(7)(a0) 784 PTR_ADDIU a0,a0,UNIT(8) 785/* 786 * Here we have less than 32(64) bytes to copy. Set up for a loop to 787 * copy one word (or double word) at a time. 788 */ 789L(ua_chk1w): 790 andi a2,t8,(NSIZE-1) /* a2 is the reminder past one (d)word chunks */ 791 beq a2,t8,L(ua_smallCopy) 792 PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */ 793 PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */ 794 795/* copying in words (4-byte or 8-byte chunks) */ 796L(ua_wordCopy_loop): 797 C_LDHI v1,UNIT(0)(a1) 798 C_LDLO v1,UNITM1(1)(a1) 799 PTR_ADDIU a0,a0,UNIT(1) 800 PTR_ADDIU a1,a1,UNIT(1) 801 bne a0,a3,L(ua_wordCopy_loop) 802 C_ST v1,UNIT(-1)(a0) 803 804/* Copy the last 8 (or 16) bytes */ 805L(ua_smallCopy): 806 beqz a2,L(leave) 807 PTR_ADDU a3,a0,a2 /* a3 is the last dst address */ 808L(ua_smallCopy_loop): 809 lb v1,0(a1) 810 PTR_ADDIU a0,a0,1 811 PTR_ADDIU a1,a1,1 812 bne a0,a3,L(ua_smallCopy_loop) 813 sb v1,-1(a0) 814 815 j ra 816 nop 817 818#else /* R6_CODE */ 819 820# if __MIPSEB 821# define SWAP_REGS(X,Y) X, Y 822# define ALIGN_OFFSET(N) (N) 823# else 824# define SWAP_REGS(X,Y) Y, X 825# define ALIGN_OFFSET(N) (NSIZE-N) 826# endif 827# define R6_UNALIGNED_WORD_COPY(BYTEOFFSET) \ 828 andi REG7, a2, (NSIZE-1);/* REG7 is # of bytes to by bytes. */ \ 829 beq REG7, a2, L(lastb); /* Check for bytes to copy by word */ \ 830 PTR_SUBU a3, a2, REG7; /* a3 is number of bytes to be copied in */ \ 831 /* (d)word chunks. */ \ 832 move a2, REG7; /* a2 is # of bytes to copy byte by byte */ \ 833 /* after word loop is finished. */ \ 834 PTR_ADDU REG6, a0, a3; /* REG6 is the dst address after loop. */ \ 835 PTR_SUBU REG2, a1, t8; /* REG2 is the aligned src address. */ \ 836 PTR_ADDU a1, a1, a3; /* a1 is addr of source after word loop. */ \ 837 C_LD t0, UNIT(0)(REG2); /* Load first part of source. */ \ 838L(r6_ua_wordcopy##BYTEOFFSET): \ 839 C_LD t1, UNIT(1)(REG2); /* Load second part of source. */ \ 840 C_ALIGN REG3, SWAP_REGS(t1,t0), ALIGN_OFFSET(BYTEOFFSET); \ 841 PTR_ADDIU a0, a0, UNIT(1); /* Increment destination pointer. */ \ 842 PTR_ADDIU REG2, REG2, UNIT(1); /* Increment aligned source pointer.*/ \ 843 move t0, t1; /* Move second part of source to first. */ \ 844 bne a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET); \ 845 C_ST REG3, UNIT(-1)(a0); \ 846 j L(lastb); \ 847 nop 848 849 /* We are generating R6 code, the destination is 4 byte aligned and 850 the source is not 4 byte aligned. t8 is 1, 2, or 3 depending on the 851 alignment of the source. */ 852 853L(r6_unaligned1): 854 R6_UNALIGNED_WORD_COPY(1) 855L(r6_unaligned2): 856 R6_UNALIGNED_WORD_COPY(2) 857L(r6_unaligned3): 858 R6_UNALIGNED_WORD_COPY(3) 859# ifdef USE_DOUBLE 860L(r6_unaligned4): 861 R6_UNALIGNED_WORD_COPY(4) 862L(r6_unaligned5): 863 R6_UNALIGNED_WORD_COPY(5) 864L(r6_unaligned6): 865 R6_UNALIGNED_WORD_COPY(6) 866L(r6_unaligned7): 867 R6_UNALIGNED_WORD_COPY(7) 868# endif 869#endif /* R6_CODE */ 870 871 .set at 872 .set reorder 873END(MEMCPY_NAME) 874