1/* Copyright 2003 SuperH Ltd.  */
2
3#include "asm.h"
4
5#ifdef __SH5__
6#if __SHMEDIA__
7
/* ZPAD_MASK(src, dst): given src = the result of an mcmpeq.b against r63
   (0xff in every byte lane that holds a NUL, 0x00 elsewhere), compute in
   dst an AND mask that preserves every byte strictly before the first NUL
   (in address order) and clears the first NUL byte and everything after it.
   Little-endian: the first NUL is the least significant 0xff byte, so a
   single subtract borrows through exactly the byte lanes below it, turning
   them into 0xff while leaving the lanes above the first NUL zeroed.
   Big-endian: byte-reverse first so the subtract borrows from the correct
   end, then reverse back.  */
#ifdef __LITTLE_ENDIAN__
#define ZPAD_MASK(src, dst) addi src, -1, dst
#else
#define ZPAD_MASK(src, dst) \
 byterev src, dst; addi dst, -1, dst; byterev dst, dst
#endif
14
15
/* We assume that the destination is not in the first 16 bytes of memory.
   A typical linker script will put the text section first, and as
   this code is longer than 16 bytes, you have to go out of your way
   to put data there.  */

/* char *strncpy (char *dst, const char *src, size_t n) -- SHmedia version.

   SH5/SHmedia argument registers: r2 = dst, r3 = src, r4 = n.
   r18 holds the caller's return address (returned to via ptabs/blink);
   r63 always reads as zero.  r2 is never modified, so it is also the
   return value.

   Strategy: work a quadword (8 bytes) at a time, using the unaligned
   ldlo/ldhi and stlo/sthi pairs at the edges.  mcmpeq.b against r63
   yields 0xff in every byte lane holding a NUL, which both detects
   end-of-string and (via ZPAD_MASK) builds the AND mask used for the
   zero padding strncpy requires.  SHHI/SHLO come from asm.h and shift
   toward the high/low *address* end -- presumably mapping to left/right
   shifts depending on endianness (TODO confirm against asm.h).  */
ENTRY(strncpy)
 pt L_small, tr2        // prepare branch target for the n <= 8 path
 ldlo.q r3, 0, r0       // r0 = low (in-quad) part of first, maybe unaligned, src quad
 shlli r3, 3, r19       // r19 = src misalignment in bits (src low bits * 8)
 mcmpeq.b r0, r63, r1   // r1: 0xff in each byte of r0 that is NUL
 SHHI r1, r19, r7       // drop false NUL flags from lanes ldlo.q did not fill
 add r2, r4, r20        // r20 = dst + n = store end address
 addi r20, -8, r5       // r5 = store end address minus 8
 /* If the size is greater than 8, we know we can read beyond the first
    (possibly partial) quadword, and write out a full first and last
    (possibly unaligned and/or overlapping) quadword.  */
 bge/u r2, r5, tr2 // L_small: n <= 8
 pt L_found0, tr0
 addi r2, 8, r22        // r22 = store address plus 8
 bnei/u r7, 0, tr0  // L_found0: NUL already inside the first quad
 ori r3, -8, r38        // r38 = (src & 7) - 8 = -(bytes to next src quad boundary)
 pt L_end_early, tr1
 sub r2, r38, r22       // r22 = dst + 8 - (src & 7); r22 + r6 is src-quad aligned
 stlo.q r2, 0, r0       // write the first (unaligned) quad ...
 sthi.q r2, 7, r0       // ... in two overlapping halves
 sub r3, r2, r6         // r6 = src - dst; store pointer + r6 = load pointer
 ldx.q r22, r6, r0      // fetch next (aligned) src quad
 /* Before each iteration, check that we can store in full the next quad we
    are about to fetch.  */
 addi r5, -8, r36       // r36 = store end address minus 16
 bgtu/u r22, r36, tr1 // L_end_early
 pt L_scan0, tr1
L_scan0:
 /* Main loop: r0 holds the current src quad; it goes to r22 - 8.  */
 addi r22, 8, r22
 mcmpeq.b r0, r63, r1   // scan this quad for NUL bytes
 stlo.q r22, -8, r0
 bnei/u r1, 0, tr0   // L_found0: NUL found in this quad
 sthi.q r22, -1, r0
 ldx.q r22, r6, r0      // fetch next src quad (guaranteed storable in full here)
 bgeu/l r36, r22, tr1 // L_scan0
L_end:
 // At end; we might re-read a few bytes when we fetch the last quad.
 // branch mispredict, so load is ready now.
 mcmpeq.b r0, r63, r1   // scan the quad fetched before leaving the loop
 addi r22, 8, r22
 bnei/u r1, 0, tr0   // L_found0
 add r3, r4, r7         // r7 = src + n = read end address
 ldlo.q r7, -8, r1      // load the last (unaligned, maybe re-read) src quad ...
 ldhi.q r7, -1, r7      // ... in two halves
 ptabs r18, tr0         // set up the return target
 stlo.q r22, -8, r0     // store the quad scanned above
 or r1, r7, r1          // r1 = combined last quad
 mcmpeq.b r1, r63, r7   // find NUL (if any) in the last quad
 sthi.q r22, -1, r0
 ZPAD_MASK (r7, r7)
 and r1, r7, r1 // mask out non-zero bytes after first zero byte
 stlo.q r20, -8, r1     // store last quad ending exactly at dst + n, ...
 sthi.q r20, -1, r1     // ... possibly overlapping the previous store
 blink tr0, r63         // return (r2 = dst unchanged)

L_end_early:
 /* Check if we can store the current quad in full.  */
 pt L_end, tr1
 add r3, r4, r7         // r7 = src + n = read end address
 bgtu/u r5, r22, tr1 // L_end // Not really unlikely, but gap is short.
 /* If not, that means we can just proceed to process the last quad.
    Two pipeline stalls are unavoidable, as we don't have enough ILP.  */
 ldlo.q r7, -8, r1      // load the last src quad in two halves
 ldhi.q r7, -1, r7
 ptabs r18, tr0         // set up the return target
 or r1, r7, r1
 mcmpeq.b r1, r63, r7   // find NUL (if any) in the last quad
 ZPAD_MASK (r7, r7)
 and r1, r7, r1 // mask out non-zero bytes after first zero byte
 stlo.q r20, -8, r1
 sthi.q r20, -1, r1
 blink tr0, r63

L_found0:
 // A NUL was found mid-copy: store the final data quad (zero-normalized),
 // then zero-fill the remainder of the n-byte destination.
 // r0: string to store, not yet zero-padding normalized.
 // r1: result of mcmpeq.b r0, r63, r1.
 // r22: store address plus 8.  I.e. address where zero padding beyond the
 //      string in r0 goes.
 // r20: store end address.
 // r5: store end address minus 8.
 pt L_write0_multiquad, tr0
 ZPAD_MASK (r1, r1)
 and r0, r1, r0 // mask out non-zero bytes after first zero byte
 stlo.q r22, -8, r0
 sthi.q r22, -1, r0
 andi r22, -8, r1 // Check if zeros to write fit in one quad word.
 bgtu/l r5, r1, tr0 // L_write0_multiquad
 ptabs r18, tr1         // set up the return target
 sub r20, r22, r1       // r1 = remaining pad length in bytes
 shlli r1, 2, r1 // Do shift in two steps so that 64 bit case is
 SHLO r0, r1, r0 // handled correctly.
 SHLO r0, r1, r0
 sthi.q r20, -1, r0     // shifted so the bytes landing in the pad area are zero
 blink tr1, r63

L_write0_multiquad:
 // More than one quad of zero padding: clear the partial quads at both
 // ends, then loop over the aligned quads in between.
 pt L_write0_loop, tr0
 ptabs r18, tr1
 stlo.q r22, 0, r63     // zero the partial quad at the start of the pad area
 sthi.q r20, -1, r63    // zero the partial quad at the end
 addi r1, 8, r1         // r1 = first fully-aligned quad inside the pad area
 bgeu/l r5, r1, tr0 // L_write0_loop
 blink tr1, r63

L_write0_loop:
 st.q r1, 0 ,r63        // zero one aligned quad
 addi r1, 8, r1
 bgeu/l r5, r1, tr0 // L_write0_loop
 blink tr1, r63

L_small:
 // n <= 8: the whole result comes from the first src quad, possibly
 // completed with one ldhi.q.
 // r0: string to store, not yet zero-padding normalized.
 // r1: result of mcmpeq.b r0, r63, r1.
 // r7: nonzero indicates relevant zero found in r0.
 // r2: store address.
 // r3: read address.
 // r4: size, max 8
 // r20: store end address.
 // r5: store end address minus 8.
 pt L_nohi, tr0
 pt L_small_storelong, tr1
 ptabs r18, tr2         // set up the return target
 sub r63, r4, r23       // r23 = -n
 bnei/u r7, 0, tr0  // L_nohi: NUL already in the loaded low part
 ori r3, -8, r7         // r7 = (src & 7) - 8
 bge/l r23, r7, tr0 // L_nohi: the n bytes don't extend past the first quad
 ldhi.q r3, 7, r1       // fetch the high part of the first src quad
 or r0, r1, r0          // r0 = complete first quad
 mcmpeq.b r0, r63, r1   // redo the NUL scan on the full quad
L_nohi:
 ZPAD_MASK (r1, r1)
 and r0, r1, r0         // clear every byte from the first NUL onwards
 movi 4, r19
 bge/u r4, r19, tr1 // L_small_storelong: 4 <= n <= 8

 // n <= 3: store byte by byte, stepping through r0 with 8-bit shifts.
 pt L_small_end, tr0
#ifndef __LITTLE_ENDIAN__
 byterev r0, r0         // bring the first string byte into the low lane
#endif
 beqi/u r4, 0, tr0 // L_small_end
 st.b r2, 0, r0
 beqi/u r4, 1, tr0 // L_small_end
 shlri r0, 8, r0
 st.b r2, 1, r0
 beqi/u r4, 2, tr0 // L_small_end
 shlri r0, 8, r0
 st.b r2, 2, r0
L_small_end:
 blink tr2, r63

L_small_storelong:
 // 4 <= n <= 8: store as two (possibly overlapping) longwords, the
 // second one ending exactly at dst + n.
 shlli r23, 3, r7       // r7 = -n * 8 = bit shift for the tail longword
 SHHI r0, r7, r1        // r1 = data positioned for the store at the end
#ifdef __LITTLE_ENDIAN__
 shlri r1, 32, r1
#else
 shlri r0, 32, r0
#endif
 stlo.l r2, 0, r0       // first 4 bytes at dst ...
 sthi.l r2, 3, r0
 stlo.l r20, -4, r1     // ... last 4 bytes ending at dst + n
 sthi.l r20, -1, r1
 blink tr2, r63
183
184#else /* SHcompact */
185
/* This code is optimized for size.  Instruction selection is SH5 specific.
   SH4 should use a different version.  */

/* char *strncpy (char *dst, const char *src, size_t n) -- SHcompact version.
   Arguments arrive in r2 = dst, r3 = src, r4 = n (SH5 compact calling
   convention -- note: not the classic SH-4 r4..r7 registers); r2 is never
   written, so it doubles as the return value.
   Byte-at-a-time copy.  The trick: the T bit tested by bt/s at the loop
   head was set by the cmp/eq in the bf/s delay slot of the PREVIOUS
   iteration; once a NUL has been stored, the load is skipped every
   iteration, r1 stays zero, and the rest of the buffer is zero padding.  */
ENTRY(strncpy)
 mov #0, r6             // r6 = constant 0, for NUL comparisons
 cmp/eq r4, r6          // n == 0?  (also clears T for the first loop pass)
 bt return              //   yes: nothing to copy, return dst
 mov r2, r5
 add #-1, r5            // r5 = dst - 1 (pre-incremented at the loop head)
 add r5, r4             // r4 = dst + n - 1 = address of the last store
loop:
 // T set here <=> the byte stored on the previous iteration was NUL.
 bt/s found0            // NUL already copied: skip the load, keep r1 == 0
 add #1, r5             // (delay slot) advance the store pointer
 mov.b @r3+, r1         // fetch the next source byte
found0:
 cmp/eq r5,r4           // reached the last byte to store?
 mov.b r1, @r5          // store byte (string data or zero padding)
 bf/s loop              // keep going until the last byte is stored
 cmp/eq r1, r6          // (delay slot) T = stored byte was NUL, for bt/s above
return:
 rts
 nop
207
208#endif /* SHcompact */
209#endif /* __SH5__ */
210