/*
 * Copyright (c) 2013
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <picolibc.h>

#ifdef ANDROID_CHANGES
# include "machine/asm.h"
# include "machine/regdef.h"
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#elif _LIBC
# include "machine/asm.h"
# include "machine/regdef.h"
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#else
# include <regdef.h>
# include <sys/asm.h>
#endif

/* Check to see if the MIPS architecture we are compiling for supports
   prefetching.  */

#if (__mips == 4) || (__mips == 5) || (__mips == 32) || (__mips == 64)
# ifndef DISABLE_PREFETCH
#  define USE_PREFETCH
# endif
#endif

#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
# ifndef DISABLE_DOUBLE
#  define USE_DOUBLE
# endif
#endif

#ifndef USE_DOUBLE
# ifndef DISABLE_DOUBLE_ALIGN
#  define DOUBLE_ALIGN
# endif
#endif

/* Some asm.h files do not have the L macro definition.  */
#ifndef L
# if _MIPS_SIM == _ABIO32
#  define L(label) $L ## label
# else
#  define L(label) .L ## label
# endif
#endif

/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
#ifndef PTR_ADDIU
# ifdef USE_DOUBLE
#  define PTR_ADDIU daddiu
# else
#  define PTR_ADDIU addiu
# endif
#endif

/* New R6 instructions that may not be in asm.h.  */
#ifndef PTR_LSA
# if _MIPS_SIM == _ABI64
#  define PTR_LSA dlsa
# else
#  define PTR_LSA lsa
# endif
#endif

/* Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
   or PREFETCH_STORE_STREAMED offers a large performance advantage
   but PREPAREFORSTORE has some special restrictions to consider.

   Prefetch with the 'prepare for store' hint does not copy a memory
   location into the cache, it just allocates a cache line and zeros
   it out.  This means that if you do not write to the entire cache
   line before it is written back to memory, the unwritten part of the
   line will be zeroed when the line is written back, and that data
   will be lost.

   There are ifdef'ed sections of this memset to make sure that it does
   not do prefetches on cache lines that are not going to be completely
   written.  This code is only needed and only used when
   PREFETCH_STORE_HINT is set to PREFETCH_HINT_PREPAREFORSTORE.  This
   code assumes that cache lines are no larger than MAX_PREFETCH_SIZE
   bytes; if they are larger, the code will not work correctly.  */
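
/* Illustrative sketch only (not from the original sources): assume a
   32-byte cache line.  A PREPAREFORSTORE prefetch of the line holding p
   allocates the line and zeros it; if only half the line is stored
   before eviction, the other half of memory is silently zeroed:

       prepare_for_store(p);     // hypothetical helper: the line now
                                 // reads as 32 zero bytes in the cache
       store_bytes(p, c, 16);    // hypothetical: only p[0..15] written
       // write-back emits 16 bytes of c followed by 16 zero bytes,
       // clobbering whatever was at p[16..31] before the prefetch.

   Hence the rule enforced below: never issue a PREPAREFORSTORE prefetch
   for a line unless every byte of it will be stored.  */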

#ifdef USE_PREFETCH
# define PREFETCH_HINT_STORE		1
# define PREFETCH_HINT_STORE_STREAMED	5
# define PREFETCH_HINT_STORE_RETAINED	7
# define PREFETCH_HINT_PREPAREFORSTORE	30

/* If we have not picked out what hints to use at this point use the
   standard store prefetch hint.  */
# ifndef PREFETCH_STORE_HINT
#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
# endif

/* We double everything when USE_DOUBLE is true so we do 2 prefetches to
   get 64 bytes in that case.  The assumption is that each individual
   prefetch brings in 32 bytes.  */
# ifdef USE_DOUBLE
#  define PREFETCH_CHUNK 64
#  define PREFETCH_FOR_STORE(chunk, reg) \
    pref PREFETCH_STORE_HINT, (chunk)*64(reg); \
    pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg)
# else
#  define PREFETCH_CHUNK 32
#  define PREFETCH_FOR_STORE(chunk, reg) \
    pref PREFETCH_STORE_HINT, (chunk)*32(reg)
# endif

/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less
   than PREFETCH_CHUNK, the assumed size of each prefetch.  If the real size
   of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE
   hint is used, the code will not work correctly.  If PREPAREFORSTORE is not
   used then MAX_PREFETCH_SIZE does not matter.  */
# define MAX_PREFETCH_SIZE 128
/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater
   than 5 on a STORE prefetch and that a single prefetch can never be larger
   than MAX_PREFETCH_SIZE.  We add the extra 32 when USE_DOUBLE is set because
   we actually do two prefetches in that case, one 32 bytes after the other.  */
# ifdef USE_DOUBLE
#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE
# else
#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE
# endif
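
/* Worked example (the numbers follow directly from the definitions above):
   with USE_DOUBLE, PREFETCH_CHUNK is 64, so
       PREFETCH_LIMIT = 5*64 + 32 + 128 = 480 bytes;
   without USE_DOUBLE, PREFETCH_CHUNK is 32, so
       PREFETCH_LIMIT = 5*32 + 128 = 288 bytes.
   Once a0 is within PREFETCH_LIMIT bytes of the end of the buffer, the
   main loop below must skip its PREPAREFORSTORE prefetch.  */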

# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \
    && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE)
/* We cannot handle this because the initial prefetches may fetch bytes
   that are before the buffer being copied.  We start copies with an
   offset of 4 so we avoid this situation when using PREPAREFORSTORE.  */
#  error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small."
# endif
#else /* USE_PREFETCH not defined */
# define PREFETCH_FOR_STORE(offset, reg)
#endif

#if __mips_isa_rev > 5
# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
#  undef PREFETCH_STORE_HINT
#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
# endif
# define R6_CODE
#endif

/* Allow the routine to be named something else if desired.  */
#ifndef MEMSET_NAME
# define MEMSET_NAME memset
#endif

/* We load/store 64 bits at a time when USE_DOUBLE is true.
   The C_ prefix stands for CHUNK and is used to avoid macro name
   conflicts with system header files.  */

#ifdef USE_DOUBLE
# define C_ST	sd
# if __MIPSEB
#  define C_STHI	sdl	/* high part is left, in big-endian  */
# else
#  define C_STHI	sdr	/* high part is right, in little-endian  */
# endif
#else
# define C_ST	sw
# if __MIPSEB
#  define C_STHI	swl	/* high part is left, in big-endian  */
# else
#  define C_STHI	swr	/* high part is right, in little-endian  */
# endif
#endif

/* Bookkeeping values for 32 vs. 64 bit mode.  */
#ifdef USE_DOUBLE
# define NSIZE 8
# define NSIZEMASK 0x3f
# define NSIZEDMASK 0x7f
#else
# define NSIZE 4
# define NSIZEMASK 0x1f
# define NSIZEDMASK 0x3f
#endif
#define UNIT(unit) ((unit)*NSIZE)
#define UNITM1(unit) (((unit)*NSIZE)-1)

#ifdef ANDROID_CHANGES
LEAF(MEMSET_NAME,0)
#else
LEAF(MEMSET_NAME)
#endif

	.set	nomips16
	.set	noreorder
/* If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
   size, copy the dst pointer to v0 for the return value.  */
	slti	t2,a2,(2 * NSIZE)
	bne	t2,zero,L(lastb)
	move	v0,a0

/* If the memset value is not zero, we copy it to all the bytes in a 32 or 64
   bit word.  */
	beq	a1,zero,L(set0)		/* If memset value is zero no smear  */
	PTR_SUBU a3,zero,a0
	nop

	/* smear byte into 32 or 64 bit word */
#if ((__mips == 64) || (__mips == 32)) && (__mips_isa_rev >= 2)
# ifdef USE_DOUBLE
	dins	a1, a1, 8, 8        /* Replicate fill byte into half-word.  */
	dins	a1, a1, 16, 16      /* Replicate fill byte into word.  */
	dins	a1, a1, 32, 32      /* Replicate fill byte into dbl word.  */
# else
	ins	a1, a1, 8, 8        /* Replicate fill byte into half-word.  */
	ins	a1, a1, 16, 16      /* Replicate fill byte into word.  */
# endif
#else
# ifdef USE_DOUBLE
	and	a1,0xff
	dsll	t2,a1,8
	or	a1,t2
	dsll	t2,a1,16
	or	a1,t2
	dsll	t2,a1,32
	or	a1,t2
# else
	and	a1,0xff
	sll	t2,a1,8
	or	a1,t2
	sll	t2,a1,16
	or	a1,t2
# endif
#endif
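
/* Illustrative C equivalent of the smear above (a sketch, not part of
   the build); for the fill byte 0xAB:

       c &= 0xff;        // c = 0x000000AB
       c |= c << 8;      // c = 0x0000ABAB
       c |= c << 16;     // c = 0xABABABAB
       // with USE_DOUBLE (64-bit c) additionally:  c |= c << 32;

   On __mips_isa_rev >= 2 the ins/dins bit-field inserts above replicate
   the byte without needing the shift/or temporaries.  */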

/* If the destination address is not aligned do a partial store to get it
   aligned.  If it is already aligned just jump to L(aligned).  */
L(set0):
#ifndef R6_CODE
	andi	t2,a3,(NSIZE-1)		/* word-unaligned address?  */
	beq	t2,zero,L(aligned)	/* t2 is the unalignment count  */
	PTR_SUBU a2,a2,t2
	C_STHI	a1,0(a0)
	PTR_ADDU a0,a0,t2
#else /* R6_CODE */
	andi	t2,a0,(NSIZE-1)
	lapc	t9,L(atable)
	PTR_LSA	t9,t2,t9,2
	jrc	t9
L(atable):
	bc	L(aligned)
# ifdef USE_DOUBLE
	bc	L(lb7)
	bc	L(lb6)
	bc	L(lb5)
	bc	L(lb4)
# endif
	bc	L(lb3)
	bc	L(lb2)
	bc	L(lb1)
L(lb7):
	sb	a1,6(a0)
L(lb6):
	sb	a1,5(a0)
L(lb5):
	sb	a1,4(a0)
L(lb4):
	sb	a1,3(a0)
L(lb3):
	sb	a1,2(a0)
L(lb2):
	sb	a1,1(a0)
L(lb1):
	sb	a1,0(a0)

	li	t9,NSIZE
	subu	t2,t9,t2
	PTR_SUBU a2,a2,t2
	PTR_ADDU a0,a0,t2
#endif /* R6_CODE */

L(aligned):
/* If USE_DOUBLE is not set we may still want to align the data on a 16
   byte boundary instead of an 8 byte boundary to maximize the opportunity
   of proAptiv chips to do memory bonding (combining two sequential 4
   byte stores into one 8 byte store).  We know there are at least 4 bytes
   left to store or we would have jumped to L(lastb) earlier in the code.  */
#ifdef DOUBLE_ALIGN
	andi	t2,a3,4
	beq	t2,zero,L(double_aligned)
	PTR_SUBU a2,a2,t2
	sw	a1,0(a0)
	PTR_ADDU a0,a0,t2
L(double_aligned):
#endif

/* Now the destination is word (or double word) aligned.  Set a2 to count
   how many bytes we have to store after all the 64/128 byte chunks are
   done, and a3 to the dest pointer after all the 64/128 byte chunks have
   been stored.  We will loop, incrementing a0, until it equals a3.  */
	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks?  */
	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks  */
	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder  */
	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop  */
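
/* Worked example (assuming USE_DOUBLE, so NSIZEDMASK is 0x7f): with
   a2 = 200 bytes left, t8 = 200 & 0x7f = 72 and a3 = a0 + (200 - 72)
   = a0 + 128, i.e. exactly one pass through L(loop16w) below, leaving
   72 bytes for the smaller loops that follow.  */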

/* When in the loop we may prefetch with the 'prepare to store' hint;
   in this case a0+x must not be past the "t0-32" address.  This
   means: for x=128 the last "safe" a0 address is "t0-160".  Alternatively,
   for x=64 the last "safe" a0 address is "t0-96".  In the current version we
   will use "prefetch hint,128(a0)", so "t0-160" is the limit.  */
#if defined(USE_PREFETCH) \
    && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address  */
	PTR_SUBU t9,t0,PREFETCH_LIMIT	/* t9 is the "last safe pref" address  */
#endif
#if defined(USE_PREFETCH) \
    && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
	PREFETCH_FOR_STORE (1, a0)
	PREFETCH_FOR_STORE (2, a0)
	PREFETCH_FOR_STORE (3, a0)
#endif

L(loop16w):
#if defined(USE_PREFETCH) \
    && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch  */
	bgtz	v1,L(skip_pref)
	nop
#endif
#ifndef R6_CODE
	PREFETCH_FOR_STORE (4, a0)
	PREFETCH_FOR_STORE (5, a0)
#else
	PREFETCH_FOR_STORE (2, a0)
#endif
L(skip_pref):
	C_ST	a1,UNIT(0)(a0)
	C_ST	a1,UNIT(1)(a0)
	C_ST	a1,UNIT(2)(a0)
	C_ST	a1,UNIT(3)(a0)
	C_ST	a1,UNIT(4)(a0)
	C_ST	a1,UNIT(5)(a0)
	C_ST	a1,UNIT(6)(a0)
	C_ST	a1,UNIT(7)(a0)
	C_ST	a1,UNIT(8)(a0)
	C_ST	a1,UNIT(9)(a0)
	C_ST	a1,UNIT(10)(a0)
	C_ST	a1,UNIT(11)(a0)
	C_ST	a1,UNIT(12)(a0)
	C_ST	a1,UNIT(13)(a0)
	C_ST	a1,UNIT(14)(a0)
	C_ST	a1,UNIT(15)(a0)
	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest  */
	bne	a0,a3,L(loop16w)
	nop
	move	a2,t8

/* Here we have the dest word-aligned but with less than 64 (or 128) bytes
   to go.  Check for a 32 (or 64) byte chunk and store it if there is one.
   Otherwise jump down to L(chk1w) to handle the tail end of the store.  */
L(chkw):
	andi	t8,a2,NSIZEMASK	/* is there a 32-byte/64-byte chunk?  */
				/* t8 is the remainder count past 32 bytes  */
	beq	a2,t8,L(chk1w)	/* when a2==t8, no 32-byte chunk  */
	nop
	C_ST	a1,UNIT(0)(a0)
	C_ST	a1,UNIT(1)(a0)
	C_ST	a1,UNIT(2)(a0)
	C_ST	a1,UNIT(3)(a0)
	C_ST	a1,UNIT(4)(a0)
	C_ST	a1,UNIT(5)(a0)
	C_ST	a1,UNIT(6)(a0)
	C_ST	a1,UNIT(7)(a0)
	PTR_ADDIU a0,a0,UNIT(8)

/* Here we have less than 32 (or 64) bytes to set.  Set up for a loop to
   store one word (or double word) at a time.  Set a2 to count how many
   bytes we have to store after all the word (or double word) chunks are
   stored and a3 to the dest pointer after all the (d)word chunks have
   been stored.  We will loop, incrementing a0, until a0 equals a3.  */
L(chk1w):
	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past one (d)word chunks  */
	beq	a2,t8,L(lastb)
	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks  */
	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop  */

/* storing in words (4-byte or 8-byte chunks)  */
L(wordCopy_loop):
	PTR_ADDIU a0,a0,UNIT(1)
	bne	a0,a3,L(wordCopy_loop)
	C_ST	a1,UNIT(-1)(a0)

/* Store the last 8 (or 16) bytes  */
L(lastb):
	blez	a2,L(leave)
	PTR_ADDU a3,a0,a2	/* a3 is the last dst address  */
L(lastbloop):
	PTR_ADDIU a0,a0,1
	bne	a0,a3,L(lastbloop)
	sb	a1,-1(a0)
L(leave):
	j	ra
	nop

	.set	at
	.set	reorder
END(MEMSET_NAME)
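
/* For reference, a simplified C sketch of the strategy implemented above
   (illustrative only; it ignores prefetching, the R6 branch table, and the
   DOUBLE_ALIGN tweak, and assumes a 4-byte word):

       #include <stddef.h>
       #include <stdint.h>

       void *memset_sketch(void *dst, int c, size_t n)
       {
           unsigned char *p = dst;
           uint32_t w = (unsigned char)c;
           w |= w << 8;                  // smear the fill byte
           w |= w << 16;                 // into a whole word

           while (n > 0 && ((uintptr_t)p & 3) != 0) {
               *p++ = (unsigned char)c;  // partial store to align dst
               n--;
           }
           while (n >= 64) {             // big chunks, like L(loop16w)
               uint32_t *q = (uint32_t *)(void *)p;
               for (int i = 0; i < 16; i++)
                   q[i] = w;
               p += 64;
               n -= 64;
           }
           while (n >= 4) {              // word loop, like L(wordCopy_loop)
               *(uint32_t *)(void *)p = w;
               p += 4;
               n -= 4;
           }
           while (n-- > 0)               // tail bytes, like L(lastbloop)
               *p++ = (unsigned char)c;
           return dst;
       }
*/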