1/* 2 * Copyright (c) 2012-2015 3 * MIPS Technologies, Inc., California. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its 14 * contributors may be used to endorse or promote products derived from 15 * this software without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30#include <picolibc.h> 31 32#ifdef ANDROID_CHANGES 33# include "machine/asm.h" 34# include "machine/regdef.h" 35# define USE_MEMMOVE_FOR_OVERLAP 36# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED 37# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE 38#elif _LIBC 39# include "machine/asm.h" 40# include "machine/regdef.h" 41# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED 42# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE 43#else 44# include <regdef.h> 45# include <sys/asm.h> 46#endif 47 48/* Check to see if the MIPS architecture we are compiling for supports 49 * prefetching. 50 */ 51 52#if (__mips == 4) || (__mips == 5) || (__mips == 32) || (__mips == 64) 53# ifndef DISABLE_PREFETCH 54# define USE_PREFETCH 55# endif 56#endif 57 58#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32)) 59# ifndef DISABLE_DOUBLE 60# define USE_DOUBLE 61# endif 62#endif 63 64 65#if __mips_isa_rev > 5 66# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) 67# undef PREFETCH_STORE_HINT 68# define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED 69# endif 70# define R6_CODE 71#endif 72 73/* Some asm.h files do not have the L macro definition. */ 74#ifndef L 75# if _MIPS_SIM == _ABIO32 76# define L(label) $L ## label 77# else 78# define L(label) .L ## label 79# endif 80#endif 81 82/* Some asm.h files do not have the PTR_ADDIU macro definition. */ 83#ifndef PTR_ADDIU 84# ifdef USE_DOUBLE 85# define PTR_ADDIU daddiu 86# else 87# define PTR_ADDIU addiu 88# endif 89#endif 90 91/* Some asm.h files do not have the PTR_SRA macro definition. */ 92#ifndef PTR_SRA 93# ifdef USE_DOUBLE 94# define PTR_SRA dsra 95# else 96# define PTR_SRA sra 97# endif 98#endif 99 100/* New R6 instructions that may not be in asm.h. */ 101#ifndef PTR_LSA 102# if _MIPS_SIM == _ABI64 103# define PTR_LSA dlsa 104# else 105# define PTR_LSA lsa 106# endif 107#endif 108 109/* 110 * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load 111 * prefetches appears to offer a slight preformance advantage. 112 * 113 * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE 114 * or PREFETCH_STORE_STREAMED offers a large performance advantage 115 * but PREPAREFORSTORE has some special restrictions to consider. 116 * 117 * Prefetch with the 'prepare for store' hint does not copy a memory 118 * location into the cache, it just allocates a cache line and zeros 119 * it out. This means that if you do not write to the entire cache 120 * line before writing it out to memory some data will get zero'ed out 121 * when the cache line is written back to memory and data will be lost. 122 * 123 * Also if you are using this memcpy to copy overlapping buffers it may 124 * not behave correctly when using the 'prepare for store' hint. If you 125 * use the 'prepare for store' prefetch on a memory area that is in the 126 * memcpy source (as well as the memcpy destination), then you will get 127 * some data zero'ed out before you have a chance to read it and data will 128 * be lost. 129 * 130 * If you are going to use this memcpy routine with the 'prepare for store' 131 * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid 132 * the problem of running memcpy on overlapping buffers. 133 * 134 * There are ifdef'ed sections of this memcpy to make sure that it does not 135 * do prefetches on cache lines that are not going to be completely written. 136 * This code is only needed and only used when PREFETCH_STORE_HINT is set to 137 * PREFETCH_HINT_PREPAREFORSTORE. This code assumes that cache lines are 138 * 32 bytes and if the cache line is larger it will not work correctly. 139 */ 140 141#ifdef USE_PREFETCH 142# define PREFETCH_HINT_LOAD 0 143# define PREFETCH_HINT_STORE 1 144# define PREFETCH_HINT_LOAD_STREAMED 4 145# define PREFETCH_HINT_STORE_STREAMED 5 146# define PREFETCH_HINT_LOAD_RETAINED 6 147# define PREFETCH_HINT_STORE_RETAINED 7 148# define PREFETCH_HINT_WRITEBACK_INVAL 25 149# define PREFETCH_HINT_PREPAREFORSTORE 30 150 151/* 152 * If we have not picked out what hints to use at this point use the 153 * standard load and store prefetch hints. 154 */ 155# ifndef PREFETCH_STORE_HINT 156# define PREFETCH_STORE_HINT PREFETCH_HINT_STORE 157# endif 158# ifndef PREFETCH_LOAD_HINT 159# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD 160# endif 161 162/* 163 * We double everything when USE_DOUBLE is true so we do 2 prefetches to 164 * get 64 bytes in that case. The assumption is that each individual 165 * prefetch brings in 32 bytes. 166 */ 167 168# ifdef USE_DOUBLE 169# define PREFETCH_CHUNK 64 170# define PREFETCH_FOR_LOAD(chunk, reg) \ 171 pref PREFETCH_LOAD_HINT, (chunk)*64(reg); \ 172 pref PREFETCH_LOAD_HINT, ((chunk)*64)+32(reg) 173# define PREFETCH_FOR_STORE(chunk, reg) \ 174 pref PREFETCH_STORE_HINT, (chunk)*64(reg); \ 175 pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg) 176# else 177# define PREFETCH_CHUNK 32 178# define PREFETCH_FOR_LOAD(chunk, reg) \ 179 pref PREFETCH_LOAD_HINT, (chunk)*32(reg) 180# define PREFETCH_FOR_STORE(chunk, reg) \ 181 pref PREFETCH_STORE_HINT, (chunk)*32(reg) 182# endif 183/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less 184 * than PREFETCH_CHUNK, the assumed size of each prefetch. If the real size 185 * of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE 186 * hint is used, the code will not work correctly. If PREPAREFORSTORE is not 187 * used then MAX_PREFETCH_SIZE does not matter. */ 188# define MAX_PREFETCH_SIZE 128 189/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater 190 * than 5 on a STORE prefetch and that a single prefetch can never be larger 191 * than MAX_PREFETCH_SIZE. We add the extra 32 when USE_DOUBLE is set because 192 * we actually do two prefetches in that case, one 32 bytes after the other. */ 193# ifdef USE_DOUBLE 194# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE 195# else 196# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE 197# endif 198# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \ 199 && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE) 200/* We cannot handle this because the initial prefetches may fetch bytes that 201 * are before the buffer being copied. We start copies with an offset 202 * of 4 so avoid this situation when using PREPAREFORSTORE. */ 203#error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small." 204# endif 205#else /* USE_PREFETCH not defined */ 206# define PREFETCH_FOR_LOAD(offset, reg) 207# define PREFETCH_FOR_STORE(offset, reg) 208#endif 209 210/* Allow the routine to be named something else if desired. */ 211#ifndef MEMCPY_NAME 212# define MEMCPY_NAME memcpy 213#endif 214 215/* We use these 32/64 bit registers as temporaries to do the copying. */ 216#define REG0 t0 217#define REG1 t1 218#define REG2 t2 219#define REG3 t3 220#if defined(_MIPS_SIM) && (_MIPS_SIM == _ABIO32 || _MIPS_SIM == _ABIO64) 221# define REG4 t4 222# define REG5 t5 223# define REG6 t6 224# define REG7 t7 225#else 226# define REG4 ta0 227# define REG5 ta1 228# define REG6 ta2 229# define REG7 ta3 230#endif 231 232/* We load/store 64 bits at a time when USE_DOUBLE is true. 233 * The C_ prefix stands for CHUNK and is used to avoid macro name 234 * conflicts with system header files. */ 235 236#ifdef USE_DOUBLE 237# define C_ST sd 238# define C_LD ld 239# if __MIPSEB 240# define C_LDHI ldl /* high part is left in big-endian */ 241# define C_STHI sdl /* high part is left in big-endian */ 242# define C_LDLO ldr /* low part is right in big-endian */ 243# define C_STLO sdr /* low part is right in big-endian */ 244# else 245# define C_LDHI ldr /* high part is right in little-endian */ 246# define C_STHI sdr /* high part is right in little-endian */ 247# define C_LDLO ldl /* low part is left in little-endian */ 248# define C_STLO sdl /* low part is left in little-endian */ 249# endif 250# define C_ALIGN dalign /* r6 align instruction */ 251#else 252# define C_ST sw 253# define C_LD lw 254# if __MIPSEB 255# define C_LDHI lwl /* high part is left in big-endian */ 256# define C_STHI swl /* high part is left in big-endian */ 257# define C_LDLO lwr /* low part is right in big-endian */ 258# define C_STLO swr /* low part is right in big-endian */ 259# else 260# define C_LDHI lwr /* high part is right in little-endian */ 261# define C_STHI swr /* high part is right in little-endian */ 262# define C_LDLO lwl /* low part is left in little-endian */ 263# define C_STLO swl /* low part is left in little-endian */ 264# endif 265# define C_ALIGN align /* r6 align instruction */ 266#endif 267 268/* Bookkeeping values for 32 vs. 64 bit mode. */ 269#ifdef USE_DOUBLE 270# define NSIZE 8 271# define NSIZEMASK 0x3f 272# define NSIZEDMASK 0x7f 273#else 274# define NSIZE 4 275# define NSIZEMASK 0x1f 276# define NSIZEDMASK 0x3f 277#endif 278#define UNIT(unit) ((unit)*NSIZE) 279#define UNITM1(unit) (((unit)*NSIZE)-1) 280 281#ifdef ANDROID_CHANGES 282LEAF(MEMCPY_NAME, 0) 283#else 284LEAF(MEMCPY_NAME) 285#endif 286 .set nomips16 287 .set noreorder 288/* 289 * Below we handle the case where memcpy is called with overlapping src and dst. 290 * Although memcpy is not required to handle this case, some parts of Android 291 * like Skia rely on such usage. We call memmove to handle such cases. 292 */ 293#ifdef USE_MEMMOVE_FOR_OVERLAP 294 PTR_SUBU t0,a0,a1 295 PTR_SRA t2,t0,31 296 xor t1,t0,t2 297 PTR_SUBU t0,t1,t2 298 sltu t2,t0,a2 299 beq t2,zero,L(memcpy) 300 la t9,memmove 301 jr t9 302 nop 303L(memcpy): 304#endif 305/* 306 * If the size is less than 2*NSIZE (8 or 16), go to L(lastb). Regardless of 307 * size, copy dst pointer to v0 for the return value. 308 */ 309 slti t2,a2,(2 * NSIZE) 310 bne t2,zero,L(lasts) 311#if defined(RETURN_FIRST_PREFETCH) || defined(RETURN_LAST_PREFETCH) 312 move v0,zero 313#else 314 move v0,a0 315#endif 316 317#ifndef R6_CODE 318 319/* 320 * If src and dst have different alignments, go to L(unaligned), if they 321 * have the same alignment (but are not actually aligned) do a partial 322 * load/store to make them aligned. If they are both already aligned 323 * we can start copying at L(aligned). 324 */ 325 xor t8,a1,a0 326 andi t8,t8,(NSIZE-1) /* t8 is a0/a1 word-displacement */ 327 bne t8,zero,L(unaligned) 328 PTR_SUBU a3, zero, a0 329 330 andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */ 331 beq a3,zero,L(aligned) /* if a3=0, it is already aligned */ 332 PTR_SUBU a2,a2,a3 /* a2 is the remining bytes count */ 333 334 C_LDHI t8,0(a1) 335 PTR_ADDU a1,a1,a3 336 C_STHI t8,0(a0) 337 PTR_ADDU a0,a0,a3 338 339#else /* R6_CODE */ 340 341/* 342 * Align the destination and hope that the source gets aligned too. If it 343 * doesn't we jump to L(r6_unaligned*) to do unaligned copies using the r6 344 * align instruction. 345 */ 346 andi t8,a0,7 347 lapc t9,L(atable) 348 PTR_LSA t9,t8,t9,2 349 jrc t9 350L(atable): 351 bc L(lb0) 352 bc L(lb7) 353 bc L(lb6) 354 bc L(lb5) 355 bc L(lb4) 356 bc L(lb3) 357 bc L(lb2) 358 bc L(lb1) 359L(lb7): 360 lb a3, 6(a1) 361 sb a3, 6(a0) 362L(lb6): 363 lb a3, 5(a1) 364 sb a3, 5(a0) 365L(lb5): 366 lb a3, 4(a1) 367 sb a3, 4(a0) 368L(lb4): 369 lb a3, 3(a1) 370 sb a3, 3(a0) 371L(lb3): 372 lb a3, 2(a1) 373 sb a3, 2(a0) 374L(lb2): 375 lb a3, 1(a1) 376 sb a3, 1(a0) 377L(lb1): 378 lb a3, 0(a1) 379 sb a3, 0(a0) 380 381 li t9,8 382 subu t8,t9,t8 383 PTR_SUBU a2,a2,t8 384 PTR_ADDU a0,a0,t8 385 PTR_ADDU a1,a1,t8 386L(lb0): 387 388 andi t8,a1,(NSIZE-1) 389 lapc t9,L(jtable) 390 PTR_LSA t9,t8,t9,2 391 jrc t9 392L(jtable): 393 bc L(aligned) 394 bc L(r6_unaligned1) 395 bc L(r6_unaligned2) 396 bc L(r6_unaligned3) 397# ifdef USE_DOUBLE 398 bc L(r6_unaligned4) 399 bc L(r6_unaligned5) 400 bc L(r6_unaligned6) 401 bc L(r6_unaligned7) 402# endif 403#endif /* R6_CODE */ 404 405L(aligned): 406 407/* 408 * Now dst/src are both aligned to (word or double word) aligned addresses 409 * Set a2 to count how many bytes we have to copy after all the 64/128 byte 410 * chunks are copied and a3 to the dst pointer after all the 64/128 byte 411 * chunks have been copied. We will loop, incrementing a0 and a1 until a0 412 * equals a3. 413 */ 414 415 andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */ 416 beq a2,t8,L(chkw) /* if a2==t8, no 64-byte/128-byte chunks */ 417 PTR_SUBU a3,a2,t8 /* subtract from a2 the reminder */ 418 PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */ 419 420/* When in the loop we may prefetch with the 'prepare to store' hint, 421 * in this case the a0+x should not be past the "t0-32" address. This 422 * means: for x=128 the last "safe" a0 address is "t0-160". Alternatively, 423 * for x=64 the last "safe" a0 address is "t0-96" In the current version we 424 * will use "prefetch hint,128(a0)", so "t0-160" is the limit. 425 */ 426#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) 427 PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */ 428 PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */ 429#endif 430 PREFETCH_FOR_LOAD (0, a1) 431 PREFETCH_FOR_LOAD (1, a1) 432 PREFETCH_FOR_LOAD (2, a1) 433 PREFETCH_FOR_LOAD (3, a1) 434#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE) 435 PREFETCH_FOR_STORE (1, a0) 436 PREFETCH_FOR_STORE (2, a0) 437 PREFETCH_FOR_STORE (3, a0) 438#endif 439#if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH) 440# if PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE 441 sltu v1,t9,a0 442 bgtz v1,L(skip_set) 443 nop 444 PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4) 445L(skip_set): 446# else 447 PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1) 448# endif 449#endif 450#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) \ 451 && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE) 452 PTR_ADDIU v0,a0,(PREFETCH_CHUNK*3) 453# ifdef USE_DOUBLE 454 PTR_ADDIU v0,v0,32 455# endif 456#endif 457L(loop16w): 458 C_LD t0,UNIT(0)(a1) 459#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) 460 sltu v1,t9,a0 /* If a0 > t9 don't use next prefetch */ 461 bgtz v1,L(skip_pref) 462#endif 463 C_LD t1,UNIT(1)(a1) 464#ifndef R6_CODE 465 PREFETCH_FOR_STORE (4, a0) 466 PREFETCH_FOR_STORE (5, a0) 467#else 468 PREFETCH_FOR_STORE (2, a0) 469#endif 470#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) 471 PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5) 472# ifdef USE_DOUBLE 473 PTR_ADDIU v0,v0,32 474# endif 475#endif 476L(skip_pref): 477 C_LD REG2,UNIT(2)(a1) 478 C_LD REG3,UNIT(3)(a1) 479 C_LD REG4,UNIT(4)(a1) 480 C_LD REG5,UNIT(5)(a1) 481 C_LD REG6,UNIT(6)(a1) 482 C_LD REG7,UNIT(7)(a1) 483#ifndef R6_CODE 484 PREFETCH_FOR_LOAD (4, a1) 485#else 486 PREFETCH_FOR_LOAD (3, a1) 487#endif 488 C_ST t0,UNIT(0)(a0) 489 C_ST t1,UNIT(1)(a0) 490 C_ST REG2,UNIT(2)(a0) 491 C_ST REG3,UNIT(3)(a0) 492 C_ST REG4,UNIT(4)(a0) 493 C_ST REG5,UNIT(5)(a0) 494 C_ST REG6,UNIT(6)(a0) 495 C_ST REG7,UNIT(7)(a0) 496 497 C_LD t0,UNIT(8)(a1) 498 C_LD t1,UNIT(9)(a1) 499 C_LD REG2,UNIT(10)(a1) 500 C_LD REG3,UNIT(11)(a1) 501 C_LD REG4,UNIT(12)(a1) 502 C_LD REG5,UNIT(13)(a1) 503 C_LD REG6,UNIT(14)(a1) 504 C_LD REG7,UNIT(15)(a1) 505#ifndef R6_CODE 506 PREFETCH_FOR_LOAD (5, a1) 507#endif 508 C_ST t0,UNIT(8)(a0) 509 C_ST t1,UNIT(9)(a0) 510 C_ST REG2,UNIT(10)(a0) 511 C_ST REG3,UNIT(11)(a0) 512 C_ST REG4,UNIT(12)(a0) 513 C_ST REG5,UNIT(13)(a0) 514 C_ST REG6,UNIT(14)(a0) 515 C_ST REG7,UNIT(15)(a0) 516 PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */ 517 bne a0,a3,L(loop16w) 518 PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */ 519 move a2,t8 520 521/* Here we have src and dest word-aligned but less than 64-bytes or 522 * 128 bytes to go. Check for a 32(64) byte chunk and copy if if there 523 * is one. Otherwise jump down to L(chk1w) to handle the tail end of 524 * the copy. 525 */ 526 527L(chkw): 528 PREFETCH_FOR_LOAD (0, a1) 529 andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */ 530 /* The t8 is the reminder count past 32-bytes */ 531 beq a2,t8,L(chk1w) /* When a2=t8, no 32-byte chunk */ 532 nop 533 C_LD t0,UNIT(0)(a1) 534 C_LD t1,UNIT(1)(a1) 535 C_LD REG2,UNIT(2)(a1) 536 C_LD REG3,UNIT(3)(a1) 537 C_LD REG4,UNIT(4)(a1) 538 C_LD REG5,UNIT(5)(a1) 539 C_LD REG6,UNIT(6)(a1) 540 C_LD REG7,UNIT(7)(a1) 541 PTR_ADDIU a1,a1,UNIT(8) 542 C_ST t0,UNIT(0)(a0) 543 C_ST t1,UNIT(1)(a0) 544 C_ST REG2,UNIT(2)(a0) 545 C_ST REG3,UNIT(3)(a0) 546 C_ST REG4,UNIT(4)(a0) 547 C_ST REG5,UNIT(5)(a0) 548 C_ST REG6,UNIT(6)(a0) 549 C_ST REG7,UNIT(7)(a0) 550 PTR_ADDIU a0,a0,UNIT(8) 551 552/* 553 * Here we have less than 32(64) bytes to copy. Set up for a loop to 554 * copy one word (or double word) at a time. Set a2 to count how many 555 * bytes we have to copy after all the word (or double word) chunks are 556 * copied and a3 to the dst pointer after all the (d)word chunks have 557 * been copied. We will loop, incrementing a0 and a1 until a0 equals a3. 558 */ 559L(chk1w): 560 andi a2,t8,(NSIZE-1) /* a2 is the reminder past one (d)word chunks */ 561 beq a2,t8,L(lastw) 562 PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */ 563 PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */ 564 565/* copying in words (4-byte or 8-byte chunks) */ 566L(wordCopy_loop): 567 C_LD REG3,UNIT(0)(a1) 568 PTR_ADDIU a0,a0,UNIT(1) 569 PTR_ADDIU a1,a1,UNIT(1) 570 bne a0,a3,L(wordCopy_loop) 571 C_ST REG3,UNIT(-1)(a0) 572 573/* If we have been copying double words, see if we can copy a single word 574 before doing byte copies. We can have, at most, one word to copy. */ 575 576L(lastw): 577#ifdef USE_DOUBLE 578 andi t8,a2,3 /* a2 is the remainder past 4 byte chunks. */ 579 beq t8,a2,L(lastb) 580 move a2,t8 581 lw REG3,0(a1) 582 sw REG3,0(a0) 583 PTR_ADDIU a0,a0,4 584 PTR_ADDIU a1,a1,4 585#endif 586 587/* Copy the last 8 (or 16) bytes */ 588L(lastb): 589 blez a2,L(leave) 590 PTR_ADDU a3,a0,a2 /* a3 is the last dst address */ 591L(lastbloop): 592 lb v1,0(a1) 593 PTR_ADDIU a0,a0,1 594 PTR_ADDIU a1,a1,1 595 bne a0,a3,L(lastbloop) 596 sb v1,-1(a0) 597L(leave): 598 j ra 599 nop 600 601/* We jump here with a memcpy of less than 8 or 16 bytes, depending on 602 whether or not USE_DOUBLE is defined. Instead of just doing byte 603 copies, check the alignment and size and use lw/sw if possible. 604 Otherwise, do byte copies. */ 605 606L(lasts): 607 andi t8,a2,3 608 beq t8,a2,L(lastb) 609 610 andi t9,a0,3 611 bne t9,zero,L(lastb) 612 andi t9,a1,3 613 bne t9,zero,L(lastb) 614 615 PTR_SUBU a3,a2,t8 616 PTR_ADDU a3,a0,a3 617 618L(wcopy_loop): 619 lw REG3,0(a1) 620 PTR_ADDIU a0,a0,4 621 PTR_ADDIU a1,a1,4 622 bne a0,a3,L(wcopy_loop) 623 sw REG3,-4(a0) 624 625 b L(lastb) 626 move a2,t8 627 628#ifndef R6_CODE 629/* 630 * UNALIGNED case, got here with a3 = "negu a0" 631 * This code is nearly identical to the aligned code above 632 * but only the destination (not the source) gets aligned 633 * so we need to do partial loads of the source followed 634 * by normal stores to the destination (once we have aligned 635 * the destination). 636 */ 637 638L(unaligned): 639 andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */ 640 beqz a3,L(ua_chk16w) /* if a3=0, it is already aligned */ 641 PTR_SUBU a2,a2,a3 /* a2 is the remining bytes count */ 642 643 C_LDHI v1,UNIT(0)(a1) 644 C_LDLO v1,UNITM1(1)(a1) 645 PTR_ADDU a1,a1,a3 646 C_STHI v1,UNIT(0)(a0) 647 PTR_ADDU a0,a0,a3 648 649/* 650 * Now the destination (but not the source) is aligned 651 * Set a2 to count how many bytes we have to copy after all the 64/128 byte 652 * chunks are copied and a3 to the dst pointer after all the 64/128 byte 653 * chunks have been copied. We will loop, incrementing a0 and a1 until a0 654 * equals a3. 655 */ 656 657L(ua_chk16w): 658 andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */ 659 beq a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */ 660 PTR_SUBU a3,a2,t8 /* subtract from a2 the reminder */ 661 PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */ 662 663# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) 664 PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */ 665 PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */ 666# endif 667 PREFETCH_FOR_LOAD (0, a1) 668 PREFETCH_FOR_LOAD (1, a1) 669 PREFETCH_FOR_LOAD (2, a1) 670# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE) 671 PREFETCH_FOR_STORE (1, a0) 672 PREFETCH_FOR_STORE (2, a0) 673 PREFETCH_FOR_STORE (3, a0) 674# endif 675# if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH) 676# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) 677 sltu v1,t9,a0 678 bgtz v1,L(ua_skip_set) 679 nop 680 PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4) 681L(ua_skip_set): 682# else 683 PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1) 684# endif 685# endif 686L(ua_loop16w): 687 PREFETCH_FOR_LOAD (3, a1) 688 C_LDHI t0,UNIT(0)(a1) 689 C_LDHI t1,UNIT(1)(a1) 690 C_LDHI REG2,UNIT(2)(a1) 691# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) 692 sltu v1,t9,a0 693 bgtz v1,L(ua_skip_pref) 694# endif 695 C_LDHI REG3,UNIT(3)(a1) 696 PREFETCH_FOR_STORE (4, a0) 697 PREFETCH_FOR_STORE (5, a0) 698L(ua_skip_pref): 699 C_LDHI REG4,UNIT(4)(a1) 700 C_LDHI REG5,UNIT(5)(a1) 701 C_LDHI REG6,UNIT(6)(a1) 702 C_LDHI REG7,UNIT(7)(a1) 703 C_LDLO t0,UNITM1(1)(a1) 704 C_LDLO t1,UNITM1(2)(a1) 705 C_LDLO REG2,UNITM1(3)(a1) 706 C_LDLO REG3,UNITM1(4)(a1) 707 C_LDLO REG4,UNITM1(5)(a1) 708 C_LDLO REG5,UNITM1(6)(a1) 709 C_LDLO REG6,UNITM1(7)(a1) 710 C_LDLO REG7,UNITM1(8)(a1) 711 PREFETCH_FOR_LOAD (4, a1) 712 C_ST t0,UNIT(0)(a0) 713 C_ST t1,UNIT(1)(a0) 714 C_ST REG2,UNIT(2)(a0) 715 C_ST REG3,UNIT(3)(a0) 716 C_ST REG4,UNIT(4)(a0) 717 C_ST REG5,UNIT(5)(a0) 718 C_ST REG6,UNIT(6)(a0) 719 C_ST REG7,UNIT(7)(a0) 720 C_LDHI t0,UNIT(8)(a1) 721 C_LDHI t1,UNIT(9)(a1) 722 C_LDHI REG2,UNIT(10)(a1) 723 C_LDHI REG3,UNIT(11)(a1) 724 C_LDHI REG4,UNIT(12)(a1) 725 C_LDHI REG5,UNIT(13)(a1) 726 C_LDHI REG6,UNIT(14)(a1) 727 C_LDHI REG7,UNIT(15)(a1) 728 C_LDLO t0,UNITM1(9)(a1) 729 C_LDLO t1,UNITM1(10)(a1) 730 C_LDLO REG2,UNITM1(11)(a1) 731 C_LDLO REG3,UNITM1(12)(a1) 732 C_LDLO REG4,UNITM1(13)(a1) 733 C_LDLO REG5,UNITM1(14)(a1) 734 C_LDLO REG6,UNITM1(15)(a1) 735 C_LDLO REG7,UNITM1(16)(a1) 736 PREFETCH_FOR_LOAD (5, a1) 737 C_ST t0,UNIT(8)(a0) 738 C_ST t1,UNIT(9)(a0) 739 C_ST REG2,UNIT(10)(a0) 740 C_ST REG3,UNIT(11)(a0) 741 C_ST REG4,UNIT(12)(a0) 742 C_ST REG5,UNIT(13)(a0) 743 C_ST REG6,UNIT(14)(a0) 744 C_ST REG7,UNIT(15)(a0) 745 PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */ 746 bne a0,a3,L(ua_loop16w) 747 PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */ 748 move a2,t8 749 750/* Here we have src and dest word-aligned but less than 64-bytes or 751 * 128 bytes to go. Check for a 32(64) byte chunk and copy if if there 752 * is one. Otherwise jump down to L(ua_chk1w) to handle the tail end of 753 * the copy. */ 754 755L(ua_chkw): 756 PREFETCH_FOR_LOAD (0, a1) 757 andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */ 758 /* t8 is the reminder count past 32-bytes */ 759 beq a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */ 760 nop 761 C_LDHI t0,UNIT(0)(a1) 762 C_LDHI t1,UNIT(1)(a1) 763 C_LDHI REG2,UNIT(2)(a1) 764 C_LDHI REG3,UNIT(3)(a1) 765 C_LDHI REG4,UNIT(4)(a1) 766 C_LDHI REG5,UNIT(5)(a1) 767 C_LDHI REG6,UNIT(6)(a1) 768 C_LDHI REG7,UNIT(7)(a1) 769 C_LDLO t0,UNITM1(1)(a1) 770 C_LDLO t1,UNITM1(2)(a1) 771 C_LDLO REG2,UNITM1(3)(a1) 772 C_LDLO REG3,UNITM1(4)(a1) 773 C_LDLO REG4,UNITM1(5)(a1) 774 C_LDLO REG5,UNITM1(6)(a1) 775 C_LDLO REG6,UNITM1(7)(a1) 776 C_LDLO REG7,UNITM1(8)(a1) 777 PTR_ADDIU a1,a1,UNIT(8) 778 C_ST t0,UNIT(0)(a0) 779 C_ST t1,UNIT(1)(a0) 780 C_ST REG2,UNIT(2)(a0) 781 C_ST REG3,UNIT(3)(a0) 782 C_ST REG4,UNIT(4)(a0) 783 C_ST REG5,UNIT(5)(a0) 784 C_ST REG6,UNIT(6)(a0) 785 C_ST REG7,UNIT(7)(a0) 786 PTR_ADDIU a0,a0,UNIT(8) 787/* 788 * Here we have less than 32(64) bytes to copy. Set up for a loop to 789 * copy one word (or double word) at a time. 790 */ 791L(ua_chk1w): 792 andi a2,t8,(NSIZE-1) /* a2 is the reminder past one (d)word chunks */ 793 beq a2,t8,L(ua_smallCopy) 794 PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */ 795 PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */ 796 797/* copying in words (4-byte or 8-byte chunks) */ 798L(ua_wordCopy_loop): 799 C_LDHI v1,UNIT(0)(a1) 800 C_LDLO v1,UNITM1(1)(a1) 801 PTR_ADDIU a0,a0,UNIT(1) 802 PTR_ADDIU a1,a1,UNIT(1) 803 bne a0,a3,L(ua_wordCopy_loop) 804 C_ST v1,UNIT(-1)(a0) 805 806/* Copy the last 8 (or 16) bytes */ 807L(ua_smallCopy): 808 beqz a2,L(leave) 809 PTR_ADDU a3,a0,a2 /* a3 is the last dst address */ 810L(ua_smallCopy_loop): 811 lb v1,0(a1) 812 PTR_ADDIU a0,a0,1 813 PTR_ADDIU a1,a1,1 814 bne a0,a3,L(ua_smallCopy_loop) 815 sb v1,-1(a0) 816 817 j ra 818 nop 819 820#else /* R6_CODE */ 821 822# if __MIPSEB 823# define SWAP_REGS(X,Y) X, Y 824# define ALIGN_OFFSET(N) (N) 825# else 826# define SWAP_REGS(X,Y) Y, X 827# define ALIGN_OFFSET(N) (NSIZE-N) 828# endif 829# define R6_UNALIGNED_WORD_COPY(BYTEOFFSET) \ 830 andi REG7, a2, (NSIZE-1);/* REG7 is # of bytes to by bytes. */ \ 831 beq REG7, a2, L(lastb); /* Check for bytes to copy by word */ \ 832 PTR_SUBU a3, a2, REG7; /* a3 is number of bytes to be copied in */ \ 833 /* (d)word chunks. */ \ 834 move a2, REG7; /* a2 is # of bytes to copy byte by byte */ \ 835 /* after word loop is finished. */ \ 836 PTR_ADDU REG6, a0, a3; /* REG6 is the dst address after loop. */ \ 837 PTR_SUBU REG2, a1, t8; /* REG2 is the aligned src address. */ \ 838 PTR_ADDU a1, a1, a3; /* a1 is addr of source after word loop. */ \ 839 C_LD t0, UNIT(0)(REG2); /* Load first part of source. */ \ 840L(r6_ua_wordcopy##BYTEOFFSET): \ 841 C_LD t1, UNIT(1)(REG2); /* Load second part of source. */ \ 842 C_ALIGN REG3, SWAP_REGS(t1,t0), ALIGN_OFFSET(BYTEOFFSET); \ 843 PTR_ADDIU a0, a0, UNIT(1); /* Increment destination pointer. */ \ 844 PTR_ADDIU REG2, REG2, UNIT(1); /* Increment aligned source pointer.*/ \ 845 move t0, t1; /* Move second part of source to first. */ \ 846 bne a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET); \ 847 C_ST REG3, UNIT(-1)(a0); \ 848 j L(lastb); \ 849 nop 850 851 /* We are generating R6 code, the destination is 4 byte aligned and 852 the source is not 4 byte aligned. t8 is 1, 2, or 3 depending on the 853 alignment of the source. */ 854 855L(r6_unaligned1): 856 R6_UNALIGNED_WORD_COPY(1) 857L(r6_unaligned2): 858 R6_UNALIGNED_WORD_COPY(2) 859L(r6_unaligned3): 860 R6_UNALIGNED_WORD_COPY(3) 861# ifdef USE_DOUBLE 862L(r6_unaligned4): 863 R6_UNALIGNED_WORD_COPY(4) 864L(r6_unaligned5): 865 R6_UNALIGNED_WORD_COPY(5) 866L(r6_unaligned6): 867 R6_UNALIGNED_WORD_COPY(6) 868L(r6_unaligned7): 869 R6_UNALIGNED_WORD_COPY(7) 870# endif 871#endif /* R6_CODE */ 872 873 .set at 874 .set reorder 875END(MEMCPY_NAME) 876