#include <picolibc.h>

!
! Fast SH memcpy
!
! by Toshiyasu Morita (tm@netcom.com)
! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
/* SH5 code Copyright 2002 SuperH Ltd. */
!
! Entry: ARG0: destination pointer
!        ARG1: source pointer
!        ARG2: byte count
!
! Exit:  RESULT: destination pointer
!        any other registers in the range r0-r7: trashed
!
! Notes: Usually one wants to do small reads and write a longword, but
!        unfortunately it is difficult in some cases to concatenate bytes
!        into a longword on the SH, so this does a longword read and small
!        writes.
!
! This implementation makes two assumptions about how it is called:
!
! 1.: If the byte count is nonzero, the address of the last byte to be
!     copied is unsigned greater than the address of the first byte to
!     be copied.  This could be easily swapped for a signed comparison,
!     but the algorithm used needs some comparison.
!
! 2.: When there are two or three bytes in the last word of an 11-or-more
!     bytes memory chunk to be copied, the rest of the word can be read
!     without side effects.
!     This could be easily changed by increasing the minimum size of
!     a fast memcpy and the amount subtracted from r7 before L_2l_loop by 2;
!     however, this would cost a few extra cycles on average.
!     For SHmedia, the assumption is that any quadword can be read in its
!     entirety if at least one byte is included in the copy.
!
! (A rough C model of this strategy is sketched at the end of this file;
! it is not part of the build.)

#include "asm.h"

ENTRY(memcpy)

#if __SHMEDIA__

#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1

	ld.b r3,0,r63
	pta/l Large,tr0
	movi 25,r0
	bgeu/u r4,r0,tr0
	nsb r4,r0
	shlli r0,5,r0
	movi (L1-L0+63*32 + 1) & 0xffff,r1
	sub r1, r0, r0
L0:	ptrel r0,tr0
	add r2,r4,r5
	ptabs r18,tr1
	add r3,r4,r6
	blink tr0,r63
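
	/* Descriptive note: the nsb/ptrel sequence above classifies the
	   byte count by the position of its most significant bit and
	   branches to one of the 32-byte code blocks laid out below:
	   0 bytes, 1 byte, 2..3, 4..7, 8..15 and 16..24 bytes each get
	   their own block starting at L1.  Counts of 25 or more were
	   already sent to Large.  */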

	.balign 8
L1:
	/* 0 byte memcpy */
	blink tr1,r63

L4_7:	/* 4..7 byte memcpy cntd. */
	stlo.l r2, 0, r0
	or r6, r7, r6
	sthi.l r5, -1, r6
	stlo.l r5, -4, r6
	blink tr1,r63

L2_3:	/* 2 or 3 byte memcpy cntd. */
	st.b r5,-1,r6
	blink tr1,r63

	/* 1 byte memcpy */
	ld.b r3,0,r0
	st.b r2,0,r0
	blink tr1,r63

L8_15:	/* 8..15 byte memcpy cntd. */
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6
	stlo.q r5, -8, r6
	blink tr1,r63

	/* 2 or 3 byte memcpy */
	ld.b r3,0,r0
	ld.b r2,0,r63
	ld.b r3,1,r1
	st.b r2,0,r0
	pta/l L2_3,tr0
	ld.b r6,-1,r6
	st.b r2,1,r1
	blink tr0, r63

	/* 4 .. 7 byte memcpy */
	LDUAL (r3, 0, r0, r1)
	pta L4_7, tr0
	ldlo.l r6, -4, r7
	or r0, r1, r0
	sthi.l r2, 3, r0
	ldhi.l r6, -1, r6
	blink tr0, r63

	/* 8 .. 15 byte memcpy */
	LDUAQ (r3, 0, r0, r1)
	pta L8_15, tr0
	ldlo.q r6, -8, r7
	or r0, r1, r0
	sthi.q r2, 7, r0
	ldhi.q r6, -1, r6
	blink tr0, r63

	/* 16 .. 24 byte memcpy */
	LDUAQ (r3, 0, r0, r1)
	LDUAQ (r3, 8, r8, r9)
	or r0, r1, r0
	sthi.q r2, 7, r0
	or r8, r9, r8
	sthi.q r2, 15, r8
	ldlo.q r6, -8, r7
	ldhi.q r6, -1, r6
	stlo.q r2, 8, r8
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6
	stlo.q r5, -8, r6
	blink tr1,r63

Large:
	ld.b r2, 0, r63
	pta/l Loop_ua, tr1
	ori r3, -8, r7
	sub r2, r7, r22
	sub r3, r2, r6
	add r2, r4, r5
	ldlo.q r3, 0, r0
	addi r5, -16, r5
	movi 64+8, r27 // could subtract r7 from that.
	stlo.q r2, 0, r0
	sthi.q r2, 7, r0
	ldx.q r22, r6, r0
	bgtu/l r27, r4, tr1

	addi r5, -48, r27
	pta/l Loop_line, tr0
	addi r6, 64, r36
	addi r6, -24, r19
	addi r6, -16, r20
	addi r6, -8, r21

Loop_line:
	ldx.q r22, r36, r63
	alloco r22, 32
	addi r22, 32, r22
	ldx.q r22, r19, r23
	sthi.q r22, -25, r0
	ldx.q r22, r20, r24
	ldx.q r22, r21, r25
	stlo.q r22, -32, r0
	ldx.q r22, r6, r0
	sthi.q r22, -17, r23
	sthi.q r22, -9, r24
	sthi.q r22, -1, r25
	stlo.q r22, -24, r23
	stlo.q r22, -16, r24
	stlo.q r22, -8, r25
	bgeu r27, r22, tr0

Loop_ua:
	addi r22, 8, r22
	sthi.q r22, -1, r0
	stlo.q r22, -8, r0
	ldx.q r22, r6, r0
	bgtu/l r5, r22, tr1

	add r3, r4, r7
	ldlo.q r7, -8, r1
	sthi.q r22, 7, r0
	ldhi.q r7, -1, r7
	ptabs r18,tr1
	stlo.q r22, 0, r0
	or r1, r7, r1
	sthi.q r5, 15, r1
	stlo.q r5, 8, r1
	blink tr1, r63

#else /* ! SHMEDIA, i.e. SH1 .. SH4 / SHcompact */

#ifdef __SH5__
#define DST r2
#define SRC r3
#define COUNT r4
#define TMP0 r5
#define TMP1 r6
#define RESULT r2
#else
#define DST r4
#define SRC r5
#define COUNT r6
#define TMP0 r2
#define TMP1 r3
#define RESULT r0
#endif

#ifdef __LITTLE_ENDIAN__
	! Little endian version copies with increasing addresses.
	mov DST,TMP1	! Save return value
	mov #11,r0	! Check if small number of bytes
	cmp/hs r0,COUNT
			! COUNT becomes src end address
	SL(bf, L_small, add SRC,COUNT)
	mov #1,r1
	tst r1,SRC	! check if source even
	SL(bt, L_even, mov COUNT,r7)
	mov.b @SRC+,r0	! no, make it even.
	mov.b r0,@DST
	add #1,DST
L_even:	tst r1,DST	! check if destination is even
	add #-3,r7
	SL(bf, L_odddst, mov #2,r1)
	tst r1,DST	! check if destination is 4-byte aligned
	mov DST,r0
	SL(bt, L_al4dst, sub SRC,r0)
	mov.w @SRC+,TMP0
	mov.w TMP0,@DST
	! add #2,DST	DST is dead here.
L_al4dst:
	tst r1,SRC
	bt L_al4both
	mov.w @SRC+,r1
	swap.w r1,r1
	add #-6,r0
	add #-6,r7	! r7 := src end address minus 9.
	.align 2
L_2l_loop:
	mov.l @SRC+,TMP0	! Read & write two longwords per iteration
	xtrct TMP0,r1
	mov.l r1,@(r0,SRC)
	cmp/hs r7,SRC
	mov.l @SRC+,r1
	xtrct r1,TMP0
	mov.l TMP0,@(r0,SRC)
	bf L_2l_loop
	add #-2,SRC
	bra L_cleanup
	add #5,r0
L_al4both:
	add #-4,r0
	.align 2
L_al4both_loop:
	mov.l @SRC+,DST	! Read longword, write longword per iteration
	cmp/hs r7,SRC
	SL(bf, L_al4both_loop, mov.l DST,@(r0,SRC))

	bra L_cleanup
	add #3,r0

L_odddst:
	tst r1,SRC
	SL(bt, L_al4src, add #-1,DST)
	mov.w @SRC+,r0
	mov.b r0,@(1,DST)
	shlr8 r0
	mov.b r0,@(2,DST)
	add #2,DST
L_al4src:
	.align 2
L_odd_loop:
	mov.l @SRC+,r0	! Read longword, write byte, word, byte per iteration
	cmp/hs r7,SRC
	mov.b r0,@(1,DST)
	shlr8 r0
	mov.w r0,@(2,DST)
	shlr16 r0
	mov.b r0,@(4,DST)
	SL(bf, L_odd_loop, add #4,DST)
	.align 2	! avoid nop in more frequently executed code.
L_cleanup2:
	mov DST,r0
	sub SRC,r0
L_cleanup:
	cmp/eq COUNT,SRC
	bt L_ready
	.align 2
L_cleanup_loop:
	mov.b @SRC+,r1
	cmp/eq COUNT,SRC
	mov.b r1,@(r0,SRC)
	bf L_cleanup_loop
L_ready:
	rts
	mov TMP1,RESULT
L_small:
	bra L_cleanup2
	add #-1,DST
#else /* ! __LITTLE_ENDIAN__ */
	! Big endian version copies with decreasing addresses.
	mov DST,r0
	add COUNT,r0
	sub DST,SRC
	mov #11,r1
	cmp/hs r1,COUNT
	SL(bf, L_small, add #-1,SRC)
	mov SRC,TMP1
	add r0,TMP1
	shlr TMP1
	SL(bt, L_even,
	 mov DST,r7)
	mov.b @(r0,SRC),TMP0
	add #-1,TMP1
	mov.b TMP0,@-r0
L_even:
	tst #1,r0
	add #-1,SRC
	SL(bf, L_odddst, add #8,r7)
	tst #2,r0
	bt L_al4dst
	add #-1,TMP1
	mov.w @(r0,SRC),r1
	mov.w r1,@-r0
L_al4dst:
	shlr TMP1
	bt L_al4both
	mov.w @(r0,SRC),r1
	swap.w r1,r1
	add #4,r7
	add #-4,SRC
	.align 2
L_2l_loop:
	mov.l @(r0,SRC),TMP0
	xtrct TMP0,r1
	mov.l r1,@-r0
	cmp/hs r7,r0
	mov.l @(r0,SRC),r1
	xtrct r1,TMP0
	mov.l TMP0,@-r0
	bt L_2l_loop
	bra L_cleanup
	add #5,SRC

	nop	! avoid nop in executed code.
L_al4both:
	add #-2,SRC
	.align 2
L_al4both_loop:
	mov.l @(r0,SRC),r1
	cmp/hs r7,r0
	SL(bt, L_al4both_loop,
	 mov.l r1,@-r0)
	bra L_cleanup
	add #3,SRC

	nop	! avoid nop in executed code.
L_odddst:
	shlr TMP1
	bt L_al4src
	mov.w @(r0,SRC),r1
	mov.b r1,@-r0
	shlr8 r1
	mov.b r1,@-r0
L_al4src:
	add #-2,SRC
	.align 2
L_odd_loop:
	mov.l @(r0,SRC),TMP0
	cmp/hs r7,r0
	mov.b TMP0,@-r0
	shlr8 TMP0
	mov.w TMP0,@-r0
	shlr16 TMP0
	mov.b TMP0,@-r0
	bt L_odd_loop

	add #3,SRC
L_cleanup:
L_small:
	cmp/eq DST,r0
	bt L_ready
	add #1,DST
	.align 2
L_cleanup_loop:
	mov.b @(r0,SRC),TMP0
	cmp/eq DST,r0
	mov.b TMP0,@-r0
	bf L_cleanup_loop
L_ready:
	rts
	mov r0,RESULT
#endif /* ! __LITTLE_ENDIAN__ */
#endif /* ! SHMEDIA */
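
/* Illustrative C model of the strategy described in the notes at the top
   of this file: small counts take a plain byte loop, larger counts align
   the source and then do longword reads with small (byte) writes, so the
   destination alignment never matters.  This is a rough sketch only: it
   collapses the aligned-destination fast paths and the xtrct-based
   realignment into the simplest equivalent form, the byte order shown
   matches the little-endian build, and the function name is invented.
   The block is excluded from the build.  */
#if 0
#include <stddef.h>
#include <stdint.h>

void *memcpy_model (void *dst0, const void *src0, size_t n)
{
  unsigned char *d = dst0;
  const unsigned char *s = src0;

  if (n < 11)				/* same small-copy threshold as above */
    {
      while (n--)
	*d++ = *s++;
      return dst0;
    }
  while ((uintptr_t) s & 3)		/* align the source for longword reads */
    {
      *d++ = *s++;
      n--;
    }
  while (n >= 4)			/* longword read ...  */
    {
      uint32_t w = *(const uint32_t *) s;
      d[0] = (unsigned char) w;		/* ... small writes (little endian) */
      d[1] = (unsigned char) (w >> 8);
      d[2] = (unsigned char) (w >> 16);
      d[3] = (unsigned char) (w >> 24);
      s += 4;
      d += 4;
      n -= 4;
    }
  while (n--)				/* trailing bytes */
    *d++ = *s++;
  return dst0;
}
#endif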