/* Copyright 2003 SuperH Ltd. */

#include <picolibc.h>

#include "asm.h"

#ifdef __SH5__
#if __SHMEDIA__

#ifdef __LITTLE_ENDIAN__
#define ZPAD_MASK(src, dst) addi src, -1, dst
#else
#define ZPAD_MASK(src, dst) \
	byterev src, dst; addi dst, -1, dst; byterev dst, dst
#endif

/* We assume that the destination is not in the first 16 bytes of memory.
   A typical linker script will put the text section first, and as
   this code is longer than 16 bytes, you would have to go out of your way
   to put data there.  */
ENTRY(strncpy)
	pt L_small, tr2
	ldlo.q r3, 0, r0
	shlli r3, 3, r19
	mcmpeq.b r0, r63, r1
	SHHI r1, r19, r7
	add r2, r4, r20		// r20: store end address.
	addi r20, -8, r5	// r5: store end address minus 8.
	/* If the size is greater than 8, we know we can read beyond the first
	   (possibly partial) quadword, and write out a full first and last
	   (possibly unaligned and/or overlapping) quadword.  */
	bge/u r2, r5, tr2 // L_small
	pt L_found0, tr0
	addi r2, 8, r22
	bnei/u r7, 0, tr0 // L_found0
	ori r3, -8, r38
	pt L_end_early, tr1
	sub r2, r38, r22
	stlo.q r2, 0, r0
	sthi.q r2, 7, r0
	sub r3, r2, r6
	ldx.q r22, r6, r0
	/* Before each iteration, check that we can store in full the next quad
	   we are about to fetch.  */
	addi r5, -8, r36
	bgtu/u r22, r36, tr1 // L_end_early
	pt L_scan0, tr1
L_scan0:
	addi r22, 8, r22
	mcmpeq.b r0, r63, r1
	stlo.q r22, -8, r0
	bnei/u r1, 0, tr0 // L_found0
	sthi.q r22, -1, r0
	ldx.q r22, r6, r0
	bgeu/l r36, r22, tr1 // L_scan0
L_end:
	// At end; we might re-read a few bytes when we fetch the last quad.
	// We arrive here via a branch mispredict, so the load is ready by now.
	mcmpeq.b r0, r63, r1
	addi r22, 8, r22
	bnei/u r1, 0, tr0 // L_found0
	add r3, r4, r7
	ldlo.q r7, -8, r1
	ldhi.q r7, -1, r7
	ptabs r18, tr0
	stlo.q r22, -8, r0
	or r1, r7, r1
	mcmpeq.b r1, r63, r7
	sthi.q r22, -1, r0
	ZPAD_MASK (r7, r7)
	and r1, r7, r1 // mask out non-zero bytes after first zero byte
	stlo.q r20, -8, r1
	sthi.q r20, -1, r1
	blink tr0, r63

L_end_early:
	/* Check if we can store the current quad in full.  */
	pt L_end, tr1
	add r3, r4, r7
	bgtu/u r5, r22, tr1 // L_end // Not really unlikely, but the gap is short.
	/* If not, we can just proceed to process the last quad.  Two pipeline
	   stalls are unavoidable, as we don't have enough ILP.  */
	ldlo.q r7, -8, r1
	ldhi.q r7, -1, r7
	ptabs r18, tr0
	or r1, r7, r1
	mcmpeq.b r1, r63, r7
	ZPAD_MASK (r7, r7)
	and r1, r7, r1 // mask out non-zero bytes after first zero byte
	stlo.q r20, -8, r1
	sthi.q r20, -1, r1
	blink tr0, r63

L_found0:
	// r0: string to store, not yet zero-padding normalized.
	// r1: result of mcmpeq.b r0, r63, r1.
	// r22: store address plus 8, i.e. the address where the zero padding
	//	beyond the string in r0 goes.
	// r20: store end address.
	// r5: store end address minus 8.
	pt L_write0_multiquad, tr0
	ZPAD_MASK (r1, r1)
	and r0, r1, r0 // mask out non-zero bytes after first zero byte
	stlo.q r22, -8, r0
	sthi.q r22, -1, r0
	andi r22, -8, r1 // Check if zeros to write fit in one quad word.
	bgtu/l r5, r1, tr0 // L_write0_multiquad
	ptabs r18, tr1
	sub r20, r22, r1
	shlli r1, 2, r1 // Do shift in two steps so that 64 bit case is
	SHLO r0, r1, r0 // handled correctly.
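	// (Assumed rationale: the total shift needed is (r20 - r22) * 8 bits,
	// which can reach a full 64; a single 64-bit shift amount would
	// presumably wrap to zero, so the shift is applied in two halves.)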
	SHLO r0, r1, r0
	sthi.q r20, -1, r0
	blink tr1, r63

L_write0_multiquad:
	pt L_write0_loop, tr0
	ptabs r18, tr1
	stlo.q r22, 0, r63
	sthi.q r20, -1, r63
	addi r1, 8, r1
	bgeu/l r5, r1, tr0 // L_write0_loop
	blink tr1, r63

L_write0_loop:
	st.q r1, 0, r63
	addi r1, 8, r1
	bgeu/l r5, r1, tr0 // L_write0_loop
	blink tr1, r63

L_small:
	// r0: string to store, not yet zero-padding normalized.
	// r1: result of mcmpeq.b r0, r63, r1.
	// r7: nonzero indicates a relevant zero byte was found in r0.
	// r2: store address.
	// r3: read address.
	// r4: size, at most 8.
	// r20: store end address.
	// r5: store end address minus 8.
	pt L_nohi, tr0
	pt L_small_storelong, tr1
	ptabs r18, tr2
	sub r63, r4, r23
	bnei/u r7, 0, tr0 // L_nohi
	ori r3, -8, r7
	bge/l r23, r7, tr0 // L_nohi
	ldhi.q r3, 7, r1
	or r0, r1, r0
	mcmpeq.b r0, r63, r1
L_nohi:
	ZPAD_MASK (r1, r1)
	and r0, r1, r0
	movi 4, r19
	bge/u r4, r19, tr1 // L_small_storelong

	pt L_small_end, tr0
#ifndef __LITTLE_ENDIAN__
	byterev r0, r0
#endif
	beqi/u r4, 0, tr0 // L_small_end
	st.b r2, 0, r0
	beqi/u r4, 1, tr0 // L_small_end
	shlri r0, 8, r0
	st.b r2, 1, r0
	beqi/u r4, 2, tr0 // L_small_end
	shlri r0, 8, r0
	st.b r2, 2, r0
L_small_end:
	blink tr2, r63

L_small_storelong:
	shlli r23, 3, r7
	SHHI r0, r7, r1
#ifdef __LITTLE_ENDIAN__
	shlri r1, 32, r1
#else
	shlri r0, 32, r0
#endif
	stlo.l r2, 0, r0
	sthi.l r2, 3, r0
	stlo.l r20, -4, r1
	sthi.l r20, -1, r1
	blink tr2, r63

#else /* SHcompact */

/* This code is optimized for size.  Instruction selection is SH5 specific.
   SH4 should use a different version.  */
ENTRY(strncpy)
	mov #0, r6		/* r6 = 0 (NUL byte for compares).  */
	cmp/eq r4, r6
	bt return		/* n == 0: nothing to copy.  */
	mov r2, r5
	add #-1, r5		/* r5 = dst - 1; pre-incremented in the loop.  */
	add r5, r4		/* r4 = dst - 1 + n: last store address.  */
loop:
	bt/s found0		/* Previous byte was NUL: skip the load, keep padding.  */
	add #1, r5		/* (delay slot) Advance the store pointer.  */
	mov.b @r3+, r1		/* Fetch the next source byte.  */
found0:
	cmp/eq r5, r4		/* Is this the last destination byte?  */
	mov.b r1, @r5		/* Store it; r1 stays NUL once the string ends.  */
	bf/s loop
	cmp/eq r1, r6		/* (delay slot) T = (byte == 0) for the next pass.  */
return:
	rts
	 nop

#endif /* SHcompact */
#endif /* __SH5__ */
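
/* For reference: a rough, portable C sketch of the behaviour both variants
   above implement.  It is an illustration only and not part of the build;
   it is not taken from the library's generic C implementation.

	#include <stddef.h>

	char *
	strncpy (char *dst, const char *src, size_t n)
	{
	  size_t i = 0;

	  // Copy source bytes until n bytes are written or a NUL is seen.
	  for (; i < n && src[i] != '\0'; i++)
	    dst[i] = src[i];
	  // Zero-pad the remainder of the destination.
	  for (; i < n; i++)
	    dst[i] = '\0';
	  return dst;
	}

   The SHmedia entry point above takes the store address in r2, the read
   address in r3 and the size in r4 (see the register notes at L_small).  */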