! Fast SH memcpy
!
! by Toshiyasu Morita (tm@netcom.com)
! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
/* SH5 code Copyright 2002 SuperH Ltd. */
!
! Entry: ARG0: destination pointer
!        ARG1: source pointer
!        ARG3: byte count
!
! Exit:  RESULT: destination pointer
!        any other registers in the range r0-r7: trashed
!
! Notes: Usually one wants to do small reads and write a longword, but
!        unfortunately it is difficult in some cases to concatenate bytes
!        into a longword on the SH, so this does a longword read and small
!        writes.
!
! This implementation makes two assumptions about how it is called:
!
! 1.: If the byte count is nonzero, the address of the last byte to be
!     copied is unsigned greater than the address of the first byte to
!     be copied.  This could easily be swapped for a signed comparison,
!     but the algorithm used needs some comparison.
!
! 2.: When there are two or three bytes in the last word of an 11-or-more
!     bytes memory chunk to be copied, the rest of the word can be read
!     without side effects.
!     This could easily be changed by increasing the minimum size of
!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
!     however, this would cost a few extra cycles on average.
!     For SHmedia, the assumption is that any quadword can be read in its
!     entirety if at least one byte is included in the copy.
#include "asm.h"

ENTRY(memcpy)

#if __SHMEDIA__

/* Unaligned quadword (.q) / longword (.l) access helpers: the low and
   high parts are loaded/stored separately and the caller merges the two
   load results with an `or'.  */
#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1

	/* SHmedia entry: r2 = destination, r3 = source, r4 = byte count;
	   r18 holds the return address (loaded into tr1 below).  */
	ld.b r3,0,r63	/* Touch the source; a load into r63 discards the data.  */
	pta/l Large,tr0
	movi 25,r0
	bgeu/u r4,r0,tr0	/* Counts of 25 bytes or more take the Large path.  */
	nsb r0,r0		/* -- see note below; classify count by sign-bit count.  */
	shlli r0,5,r0	/* Each size class has a 32-byte handler stub below.  */
	movi (L1-L0+63*32 + 1) & 0xffff,r1
	sub r1, r0, r0
L0:	ptrel r0,tr0	/* tr0 = PC-relative handler for this size class.
			   NOTE(review): the +1 in the offset presumably keeps the
			   target in SHmedia mode -- confirm against asm.h/ABI.  */
	add r2,r4,r5	/* r5 = one past the last destination byte.  */
	ptabs r18,tr1	/* tr1 = return address.  */
	add r3,r4,r6	/* r6 = one past the last source byte.  */
	blink tr0,r63	/* Dispatch to the selected handler.  */

	.balign 8
L1:
	/* 0 byte memcpy */
	blink tr1,r63

	/* The "cntd." handlers below finish copies whose loads were
	   already started by the dispatch stubs further down.  */
L4_7:	/* 4..7 byte memcpy cntd. */
	stlo.l r2, 0, r0
	or r6, r7, r6
	sthi.l r5, -1, r6	/* Unaligned store of the (overlapping) tail longword.  */
	stlo.l r5, -4, r6
	blink tr1,r63

L2_3:	/* 2 or 3 byte memcpy cntd. */
	st.b r5,-1,r6	/* Store the last byte (loaded in the dispatch stub).  */
	blink tr1,r63

	/* 1 byte memcpy */
	ld.b r3,0,r0
	st.b r2,0,r0
	blink tr1,r63

L8_15:	/* 8..15 byte memcpy cntd. */
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6	/* Unaligned store of the (overlapping) tail quadword.  */
	stlo.q r5, -8, r6
	blink tr1,r63

	/* 2 or 3 byte memcpy */
	ld.b r3,0,r0
	ld.b r2,0,r63	/* Touch the destination.  */
	ld.b r3,1,r1
	st.b r2,0,r0
	pta/l L2_3,tr0
	ld.b r6,-1,r6	/* r6 = last source byte (r6 was src end).  */
	st.b r2,1,r1
	blink tr0, r63

	/* 4 .. 7 byte memcpy */
	LDUAL (r3, 0, r0, r1)	/* Unaligned load of the first longword.  */
	pta L4_7, tr0
	ldlo.l r6, -4, r7	/* Start the unaligned load of the last longword.  */
	or r0, r1, r0
	sthi.l r2, 3, r0
	ldhi.l r6, -1, r6
	blink tr0, r63

	/* 8 .. 15 byte memcpy */
	LDUAQ (r3, 0, r0, r1)	/* Unaligned load of the first quadword.  */
	pta L8_15, tr0
	ldlo.q r6, -8, r7	/* Start the unaligned load of the last quadword.  */
	or r0, r1, r0
	sthi.q r2, 7, r0
	ldhi.q r6, -1, r6
	blink tr0, r63

	/* 16 .. 24 byte memcpy */
	LDUAQ (r3, 0, r0, r1)	/* First quadword.  */
	LDUAQ (r3, 8, r8, r9)	/* Second quadword.  */
	or r0, r1, r0
	sthi.q r2, 7, r0
	or r8, r9, r8
	sthi.q r2, 15, r8
	ldlo.q r6, -8, r7	/* Last (possibly overlapping) quadword.  */
	ldhi.q r6, -1, r6
	stlo.q r2, 8, r8
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6
	stlo.q r5, -8, r6
	blink tr1,r63

Large:
	/* >= 25 bytes.  Copy an unaligned head quadword, then quadwords
	   from aligned source addresses, then an unaligned tail.  */
	ld.b r2, 0, r63	/* Touch the destination.  */
	pta/l Loop_ua, tr1
	ori r3, -8, r7	/* r7 = src | ~7 (source rounded down modulo 8, sign-style).  */
	sub r2, r7, r22	/* r22: destination cursor; r22 + r6 addresses the next
			   aligned source quadword.  */
	sub r3, r2, r6	/* r6 = src - dst.  */
	add r2, r4, r5	/* r5 = dst end.  */
	ldlo.q r3, 0, r0
	addi r5, -16, r5
	movi 64+8, r27 // could subtract r7 from that.
	stlo.q r2, 0, r0
	sthi.q r2, 7, r0	/* Head: first (unaligned) quadword stored.  */
	ldx.q r22, r6, r0	/* Preload next aligned source quadword.  */
	bgtu/l r27, r4, tr1	/* Below the cache-line threshold: plain loop.  */

	addi r5, -48, r27	/* r27 = loop bound for the 32-byte line loop.  */
	pta/l Loop_line, tr0
	addi r6, 64, r36	/* r36: prefetch offset, one line ahead.  */
	addi r6, -24, r19	/* r19/r20/r21: offsets of the 2nd/3rd/4th source
				   quadwords relative to the (advanced) r22.  */
	addi r6, -16, r20
	addi r6, -8, r21

Loop_line:
	/* 32 bytes per iteration.  */
	ldx.q r22, r36, r63	/* Prefetch a source line ahead (result discarded).  */
	alloco r22, 32	/* Allocate the destination cache line without fetching it.  */
	addi r22, 32, r22
	ldx.q r22, r19, r23
	sthi.q r22, -25, r0
	ldx.q r22, r20, r24
	ldx.q r22, r21, r25
	stlo.q r22, -32, r0
	ldx.q r22, r6, r0	/* Reload r0 for the next iteration / the tail.  */
	sthi.q r22, -17, r23
	sthi.q r22, -9, r24
	sthi.q r22, -1, r25
	stlo.q r22, -24, r23
	stlo.q r22, -16, r24
	stlo.q r22, -8, r25
	bgeu r27, r22, tr0

Loop_ua:
	/* 8 bytes per iteration, unaligned destination stores.  */
	addi r22, 8, r22
	sthi.q r22, -1, r0
	stlo.q r22, -8, r0
	ldx.q r22, r6, r0
	bgtu/l r5, r22, tr1

	/* Tail: merge the last aligned quadword with the final unaligned one.  */
	add r3, r4, r7	/* r7 = src end.  */
	ldlo.q r7, -8, r1
	sthi.q r22, 7, r0
	ldhi.q r7, -1, r7
	ptabs r18,tr1
	stlo.q r22, 0, r0
	or r1, r7, r1
	sthi.q r5, 15, r1
	stlo.q r5, 8, r1
	blink tr1, r63

#else /* ! SHMEDIA, i.e. SH1 .. SH4 / SHcompact */

/* Register roles differ between the SH5 (SHcompact) and classic SH ABIs.  */
#ifdef __SH5__
#define DST r2
#define SRC r3
#define COUNT r4
#define TMP0 r5
#define TMP1 r6
#define RESULT r2
#else
#define DST r4
#define SRC r5
#define COUNT r6
#define TMP0 r2
#define TMP1 r3
#define RESULT r0
#endif

/* SL(cond, label, insn) -- presumably a branch with `insn' in the delay
   slot; defined in asm.h (TODO confirm).  */

#ifdef __LITTLE_ENDIAN__
	! Little endian version copies with increasing addresses.
	mov DST,TMP1	! Save return value
	mov #11,r0	! Check if small number of bytes
	cmp/hs r0,COUNT
	! COUNT becomes src end address
	SL(bf, L_small, add SRC,COUNT)
	mov #1,r1
	tst r1,SRC	! check if source even
	SL(bt, L_even, mov COUNT,r7)	! r7 := src end address
	mov.b @SRC+,r0	! no, make it even.
	mov.b r0,@DST
	add #1,DST
L_even:	tst r1,DST	! check if destination is even
	add #-3,r7
	SL(bf, L_odddst, mov #2,r1)
	tst r1,DST	! check if destination is 4-byte aligned
	mov DST,r0
	SL(bt, L_al4dst, sub SRC,r0)	! r0 := dst - src, so @(r0,SRC) writes dst
	mov.w @SRC+,TMP0
	mov.w TMP0,@DST
	! add #2,DST	DST is dead here.
L_al4dst:
	tst r1,SRC	! source 4-byte aligned too?
	bt L_al4both
	! Source is only 2-byte aligned: read longwords and re-align
	! the data with xtrct (concatenates halves of two registers).
	mov.w @SRC+,r1
	swap.w r1,r1
	add #-6,r0
	add #-6,r7	! r7 := src end address minus 9.
	.align 2
L_2l_loop:
	mov.l @SRC+,TMP0	! Read & write two longwords per iteration
	xtrct TMP0,r1
	mov.l r1,@(r0,SRC)
	cmp/hs r7,SRC
	mov.l @SRC+,r1
	xtrct r1,TMP0
	mov.l TMP0,@(r0,SRC)
	bf L_2l_loop
	add #-2,SRC
	bra L_cleanup
	add #5,r0	! delay slot: restore r0 = dst - src for cleanup
L_al4both:
	add #-4,r0
	.align 2
L_al4both_loop:
	! DST's register is free here (destination tracked via r0+SRC),
	! so it is reused as the copy scratch register.
	mov.l @SRC+,DST	! Read longword, write longword per iteration
	cmp/hs r7,SRC
	SL(bf, L_al4both_loop, mov.l DST,@(r0,SRC))

	bra L_cleanup
	add #3,r0	! delay slot: restore r0 = dst - src for cleanup

L_odddst:
	! Destination is odd: write byte / word / byte per longword read.
	tst r1,SRC
	SL(bt, L_al4src, add #-1,DST)
	mov.w @SRC+,r0
	mov.b r0,@(1,DST)
	shlr8 r0
	mov.b r0,@(2,DST)
	add #2,DST
L_al4src:
	.align 2
L_odd_loop:
	mov.l @SRC+,r0	! Read longword, write byte, word, byte per iteration
	cmp/hs r7,SRC
	mov.b r0,@(1,DST)
	shlr8 r0
	mov.w r0,@(2,DST)
	shlr16 r0
	mov.b r0,@(4,DST)
	SL(bf, L_odd_loop, add #4,DST)
	.align 2 ! avoid nop in more frequently executed code.
L_cleanup2:
	mov DST,r0
	sub SRC,r0	! r0 := dst - src for the byte cleanup loop
L_cleanup:
	! Byte-by-byte copy of whatever remains before COUNT (src end).
	cmp/eq COUNT,SRC
	bt L_ready
	.align 2
L_cleanup_loop:
	mov.b @SRC+,r1
	cmp/eq COUNT,SRC
	mov.b r1,@(r0,SRC)
	bf L_cleanup_loop
L_ready:
	rts
	mov TMP1,RESULT	! delay slot: return the saved destination pointer
L_small:
	bra L_cleanup2
	add #-1,DST	! delay slot: bias DST for the @(r0,SRC) addressing
#else /* ! __LITTLE_ENDIAN__ */
	! Big endian version copies with decreasing addresses.
	! r0 walks the destination downward (predecrement stores);
	! SRC is rebased to src - dst so @(r0,SRC) reads the source.
	mov DST,r0
	add COUNT,r0	! r0 := dst end address
	sub DST,SRC	! SRC := src - dst
	mov #11,r1
	cmp/hs r1,COUNT
	SL(bf, L_small, add #-1,SRC)
	mov SRC,TMP1
	add r0,TMP1	! TMP1 := address of last source byte
	shlr TMP1	! T := alignment bit of the last source byte
	SL(bt, L_even,
	 mov DST,r7)
	mov.b @(r0,SRC),TMP0	! copy the odd last byte first
	add #-1,TMP1
	mov.b TMP0,@-r0
L_even:
	tst #1,r0	! destination even?
	add #-1,SRC
	SL(bf, L_odddst, add #8,r7)
	tst #2,r0	! destination 4-byte aligned?
	bt L_al4dst
	add #-1,TMP1
	mov.w @(r0,SRC),r1
	mov.w r1,@-r0
L_al4dst:
	shlr TMP1	! source 4-byte aligned too?
	bt L_al4both
	! Source only 2-byte aligned: re-align longwords with xtrct.
	mov.w @(r0,SRC),r1
	swap.w r1,r1
	add #4,r7
	add #-4,SRC
	.align 2
L_2l_loop:
	mov.l @(r0,SRC),TMP0	! Read & write two longwords per iteration
	xtrct TMP0,r1
	mov.l r1,@-r0
	cmp/hs r7,r0
	mov.l @(r0,SRC),r1
	xtrct r1,TMP0
	mov.l TMP0,@-r0
	bt L_2l_loop
	bra L_cleanup
	add #5,SRC	! delay slot: restore SRC bias for cleanup

	nop ! avoid nop in executed code.
L_al4both:
	add #-2,SRC
	.align 2
L_al4both_loop:
	mov.l @(r0,SRC),r1	! Read longword, write longword per iteration
	cmp/hs r7,r0
	SL(bt, L_al4both_loop,
	 mov.l r1,@-r0)
	bra L_cleanup
	add #3,SRC	! delay slot: restore SRC bias for cleanup

	nop ! avoid nop in executed code.
L_odddst:
	! Destination odd: write byte / word / byte per longword read.
	shlr TMP1
	bt L_al4src
	mov.w @(r0,SRC),r1
	mov.b r1,@-r0
	shlr8 r1
	mov.b r1,@-r0
L_al4src:
	add #-2,SRC
	.align 2
L_odd_loop:
	mov.l @(r0,SRC),TMP0	! Read longword, write byte, word, byte per iteration
	cmp/hs r7,r0
	mov.b TMP0,@-r0
	shlr8 TMP0
	mov.w TMP0,@-r0
	shlr16 TMP0
	mov.b TMP0,@-r0
	bt L_odd_loop

	add #3,SRC	! restore SRC bias for cleanup
L_cleanup:
L_small:
	! Byte-by-byte copy downward until r0 reaches DST.
	cmp/eq DST,r0
	bt L_ready
	add #1,DST
	.align 2
L_cleanup_loop:
	mov.b @(r0,SRC),TMP0
	cmp/eq DST,r0
	mov.b TMP0,@-r0
	bf L_cleanup_loop
L_ready:
	rts
	mov r0,RESULT	! delay slot: r0 is back at the original destination
#endif /* ! __LITTLE_ENDIAN__ */
#endif /* ! SHMEDIA */