/* Copyright 2003 SuperH Ltd. */

#include "asm.h"

#ifdef __SH5__
#if __SHMEDIA__

#ifdef __LITTLE_ENDIAN__
#define ZPAD_MASK(src, dst) addi src, -1, dst
#else
#define ZPAD_MASK(src, dst) \
	byterev src, dst; addi dst, -1, dst; byterev dst, dst
#endif


/* We assume that the destination is not in the first 16 bytes of memory.
   A typical linker script will put the text section first, and as
   this code is longer than 16 bytes, you have to go out of your way
   to put data there.  */
ENTRY(strncpy)
	// SH5 calling convention: r2 = destination, r3 = source, r4 = count;
	// r18 holds the return address and r63 always reads as zero.
	pt L_small, tr2
	ldlo.q r3, 0, r0
	shlli r3, 3, r19
	mcmpeq.b r0, r63, r1
	SHHI r1, r19, r7
	add r2, r4, r20
	addi r20, -8, r5
	/* If the size is greater than 8, we know we can read beyond the first
	   (possibly partial) quadword, and write out a full first and last
	   (possibly unaligned and/or overlapping) quadword.  */
	bge/u r2, r5, tr2 // L_small
	pt L_found0, tr0
	addi r2, 8, r22
	bnei/u r7, 0, tr0 // L_found0
	ori r3, -8, r38
	pt L_end_early, tr1
	sub r2, r38, r22
	stlo.q r2, 0, r0
	sthi.q r2, 7, r0
	sub r3, r2, r6
	ldx.q r22, r6, r0
	/* Before each iteration, check that we can store in full the next quad
	   we are about to fetch.  */
	addi r5, -8, r36
	bgtu/u r22, r36, tr1 // L_end_early
	pt L_scan0, tr1
L_scan0:
	addi r22, 8, r22
	mcmpeq.b r0, r63, r1
	stlo.q r22, -8, r0
	bnei/u r1, 0, tr0 // L_found0
	sthi.q r22, -1, r0
	ldx.q r22, r6, r0
	bgeu/l r36, r22, tr1 // L_scan0
L_end:
	// At the end; we might re-read a few bytes when we fetch the last quad.
	// We only get here via a mispredict of the loop's exit branch, so the
	// load issued in the loop is ready by now.
	mcmpeq.b r0, r63, r1
	addi r22, 8, r22
	bnei/u r1, 0, tr0 // L_found0
	add r3, r4, r7
	ldlo.q r7, -8, r1
	ldhi.q r7, -1, r7
	ptabs r18, tr0
	stlo.q r22, -8, r0
	or r1, r7, r1
	mcmpeq.b r1, r63, r7
	sthi.q r22, -1, r0
	ZPAD_MASK (r7, r7)
	and r1, r7, r1 // mask out non-zero bytes after first zero byte
	stlo.q r20, -8, r1
	sthi.q r20, -1, r1
	blink tr0, r63

L_end_early:
	/* Check if we can store the current quad in full.  */
	pt L_end, tr1
	add r3, r4, r7
	bgtu/u r5, r22, tr1 // L_end // Not really unlikely, but gap is short.
	/* If not, that means we can just proceed to process the last quad.
	   Two pipeline stalls are unavoidable, as we don't have enough ILP.  */
	ldlo.q r7, -8, r1
	ldhi.q r7, -1, r7
	ptabs r18, tr0
	or r1, r7, r1
	mcmpeq.b r1, r63, r7
	ZPAD_MASK (r7, r7)
	and r1, r7, r1 // mask out non-zero bytes after first zero byte
	stlo.q r20, -8, r1
	sthi.q r20, -1, r1
	blink tr0, r63

L_found0:
	// r0: string to store, not yet zero-padding normalized.
	// r1: result of mcmpeq.b r0, r63, r1.
	// r22: store address plus 8, i.e. the address where zero padding beyond
	//      the string in r0 goes.
	// r20: store end address.
	// r5: store end address minus 8.
	pt L_write0_multiquad, tr0
	ZPAD_MASK (r1, r1)
	and r0, r1, r0 // mask out non-zero bytes after first zero byte
	stlo.q r22, -8, r0
	sthi.q r22, -1, r0
	andi r22, -8, r1 // Check if zeros to write fit in one quad word.
	bgtu/l r5, r1, tr0 // L_write0_multiquad
	ptabs r18, tr1
	sub r20, r22, r1
	shlli r1, 2, r1 // Do shift in two steps so that 64 bit case is
	SHLO r0, r1, r0 // handled correctly.
	SHLO r0, r1, r0
	sthi.q r20, -1, r0
	blink tr1, r63

L_write0_multiquad:
	pt L_write0_loop, tr0
	ptabs r18, tr1
	stlo.q r22, 0, r63
	sthi.q r20, -1, r63
	addi r1, 8, r1
	bgeu/l r5, r1, tr0 // L_write0_loop
	blink tr1, r63

L_write0_loop:
	st.q r1, 0, r63
	addi r1, 8, r1
	bgeu/l r5, r1, tr0 // L_write0_loop
	blink tr1, r63

L_small:
	// r0: string to store, not yet zero-padding normalized.
	// r1: result of mcmpeq.b r0, r63, r1.
	// r7: nonzero indicates a relevant zero byte was found in r0.
	// r2: store address.
	// r3: read address.
	// r4: size, max 8.
	// r20: store end address.
	// r5: store end address minus 8.
	pt L_nohi, tr0
	pt L_small_storelong, tr1
	ptabs r18, tr2
	sub r63, r4, r23
	bnei/u r7, 0, tr0 // L_nohi
	ori r3, -8, r7
	bge/l r23, r7, tr0 // L_nohi
	ldhi.q r3, 7, r1
	or r0, r1, r0
	mcmpeq.b r0, r63, r1
L_nohi:
	ZPAD_MASK (r1, r1)
	and r0, r1, r0
	movi 4, r19
	bge/u r4, r19, tr1 // L_small_storelong

	pt L_small_end, tr0
#ifndef __LITTLE_ENDIAN__
	byterev r0, r0
#endif
	beqi/u r4, 0, tr0 // L_small_end
	st.b r2, 0, r0
	beqi/u r4, 1, tr0 // L_small_end
	shlri r0, 8, r0
	st.b r2, 1, r0
	beqi/u r4, 2, tr0 // L_small_end
	shlri r0, 8, r0
	st.b r2, 2, r0
L_small_end:
	blink tr2, r63

L_small_storelong:
	shlli r23, 3, r7
	SHHI r0, r7, r1
#ifdef __LITTLE_ENDIAN__
	shlri r1, 32, r1
#else
	shlri r0, 32, r0
#endif
	stlo.l r2, 0, r0
	sthi.l r2, 3, r0
	stlo.l r20, -4, r1
	sthi.l r20, -1, r1
	blink tr2, r63

#else /* SHcompact */

/* This code is optimized for size.  Instruction selection is SH5 specific.
   SH4 should use a different version.  */
ENTRY(strncpy)
	mov #0, r6
	cmp/eq r4, r6
	bt return
	mov r2, r5
	add #-1, r5
	add r5, r4
loop:
	bt/s found0
	add #1, r5
	mov.b @r3+, r1
found0:
	cmp/eq r5, r4
	mov.b r1, @r5
	bf/s loop
	cmp/eq r1, r6
return:
	rts
	nop

#endif /* SHcompact */
#endif /* __SH5__ */
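
/* For reference, a rough C-level sketch of the semantics both variants above
   implement: copy at most n bytes from src to dst, stopping after the first
   NUL, and zero-fill any remaining bytes up to n.  The SHmedia variant
   additionally works a quadword at a time, using mcmpeq.b against the zero
   register r63 to spot a terminating zero byte inside a loaded quadword and
   ZPAD_MASK/and to clear the bytes that follow it.  This sketch lives in a
   comment, is illustrative only, and is not part of the build.

	char *strncpy (char *dst, const char *src, size_t n)
	{
	  size_t i = 0;
	  for (; i < n && src[i] != '\0'; i++)	// copy up to n bytes or NUL
	    dst[i] = src[i];
	  for (; i < n; i++)			// zero-pad the remainder
	    dst[i] = '\0';
	  return dst;
	}
*/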