1 // Copyright 2018 Ulf Adams
2 //
3 // The contents of this file may be used under the terms of the Apache License,
4 // Version 2.0.
5 //
6 // (See accompanying file LICENSE-Apache or copy at
7 // http://www.apache.org/licenses/LICENSE-2.0)
8 //
9 // Alternatively, the contents of this file may be used under the terms of
10 // the Boost Software License, Version 1.0.
11 // (See accompanying file LICENSE-Boost or copy at
12 // https://www.boost.org/LICENSE_1_0.txt)
13 //
14 // Unless required by applicable law or agreed to in writing, this software
15 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 // KIND, either express or implied.
17 #ifndef RYU_D2S_INTRINSICS_H
18 #define RYU_D2S_INTRINSICS_H
19
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

// Defines RYU_32_BIT_PLATFORM if applicable.
#include "common.h"
24
25 // ABSL avoids uint128_t on Win32 even if __SIZEOF_INT128__ is defined.
26 // Let's do the same for now.
27 #if defined(__SIZEOF_INT128__) && !defined(_MSC_VER) && !defined(RYU_ONLY_64_BIT_OPS)
28 #define HAS_UINT128
29 #elif defined(_MSC_VER) && !defined(RYU_ONLY_64_BIT_OPS) && defined(_M_X64)
30 #define HAS_64_BIT_INTRINSICS
31 #endif
32
33 #if defined(HAS_UINT128)
34 typedef __uint128_t uint128_t;
35 #endif
36
37 #if defined(HAS_64_BIT_INTRINSICS)
38
39 #include <intrin.h>
40
// Returns the low 64 bits of the 128-bit product a * b and stores the high
// 64 bits in *productHi. Thin wrapper over the MSVC x64 _umul128 intrinsic.
static inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
  return _umul128(a, b, productHi);
}
44
// Shifts the 128-bit value (hi:lo) right by dist bits and returns the low
// 64 bits of the result. Requires dist < 64.
static inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
  // For the __shiftright128 intrinsic, the shift value is always
  // modulo 64.
  // In the current implementation of the double-precision version
  // of Ryu, the shift value is always < 64. (In the case
  // RYU_OPTIMIZE_SIZE == 0, the shift value is in the range [49, 58].
  // Otherwise in the range [2, 59].)
  // However, this function is now also called by s2d, which requires supporting
  // the larger shift range (TODO: what is the actual range?).
  // Check this here in case a future change requires larger shift
  // values. In this case this function needs to be adjusted.
  assert(dist < 64);
  return __shiftright128(lo, hi, (unsigned char) dist);
}
59
60 #else // defined(HAS_64_BIT_INTRINSICS)
61
// Fallback for compilers without the x64 intrinsics: out-of-line
// implementations provided by a separate translation unit.
// __umul128 returns the low 64 bits of a * b and stores the high 64 bits in
// *productHi; __shiftright128 returns the low 64 bits of ((hi:lo) >> dist).
// NOTE(review): identifiers starting with a double underscore are reserved
// for the implementation; consider renaming these project-defined symbols.
uint64_t __umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi);
#define umul128(a,b,hi) __umul128(a,b,hi)

uint64_t __shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist);
#define shiftright128(lo,hi,dist) __shiftright128(lo,hi,dist)
67
68 #endif // defined(HAS_64_BIT_INTRINSICS)
69
70 #if defined(RYU_32_BIT_PLATFORM)
71
// Returns the high 64 bits of the 128-bit product of a and b.
static inline uint64_t umulh(const uint64_t a, const uint64_t b) {
  // Delegate to umul128 and keep only the upper half; the optimizer is
  // expected to discard the instructions that compute the unused low half.
  uint64_t high;
  umul128(a, b, &high);
  return high;
}
81
82 // On 32-bit platforms, compilers typically generate calls to library
83 // functions for 64-bit divisions, even if the divisor is a constant.
84 //
85 // E.g.:
86 // https://bugs.llvm.org/show_bug.cgi?id=37932
87 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=17958
88 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=37443
89 //
90 // The functions here perform division-by-constant using multiplications
91 // in the same way as 64-bit compilers would do.
92 //
93 // NB:
94 // The multipliers and shift values are the ones generated by clang x64
95 // for expressions like x/5, x/10, etc.
96
// Computes x / 5 via reciprocal multiplication.
// 0xCCCCCCCCCCCCCCCD == ceil(2^66 / 5), so umulh(x, C) >> 2 == (x * C) >> 66.
static inline uint64_t div5(const uint64_t x) {
  const uint64_t hi = umulh(x, 0xCCCCCCCCCCCCCCCDu);
  return hi >> 2;
}
100
// Computes x / 10 via reciprocal multiplication; same multiplier as div5 with
// one extra shift bit, i.e. (x * C) >> 67.
static inline uint64_t div10(const uint64_t x) {
  const uint64_t hi = umulh(x, 0xCCCCCCCCCCCCCCCDu);
  return hi >> 3;
}
104
// Computes x / 100: pre-shift x by 2 (dividing by 4), then apply the
// multiplier/shift pair clang generates for the remaining division by 25.
static inline uint64_t div100(const uint64_t x) {
  const uint64_t hi = umulh(x >> 2, 0x28F5C28F5C28F5C3u);
  return hi >> 2;
}
108
// Computes x / 10^8 using the multiplier/shift pair clang emits for x / 1e8
// on x64 (see the note above these helpers).
static inline uint64_t div1e8(const uint64_t x) {
  const uint64_t hi = umulh(x, 0xABCC77118461CEFDu);
  return hi >> 26;
}
112
// Computes x / 10^9: pre-shift x by 9, then multiply by the clang-generated
// reciprocal for the remaining factor and take the top bits.
static inline uint64_t div1e9(const uint64_t x) {
  const uint64_t hi = umulh(x >> 9, 0x44B82FA09B5A53u);
  return hi >> 11;
}
116
// Computes x % 10^9, avoiding 64-bit arithmetic where possible.
static inline uint32_t mod1e9(const uint64_t x) {
  // x and 10^9 * div1e9(x) differ by less than 10^9, so their upper 32 bits
  // agree and the subtraction can be done entirely in 32 bits.
  // Truncating div1e9(x) *before* the multiply is also safe: the discarded
  // high bits of the quotient cannot influence the low 32 bits of the
  // product.
  const uint32_t xLow = (uint32_t) x;
  const uint32_t quotLow = (uint32_t) div1e9(x);
  return xLow - 1000000000 * quotLow;
}
129
130 #else // defined(RYU_32_BIT_PLATFORM)
131
// Computes x / 5. On 64-bit targets the compiler already lowers a division
// by a constant to a reciprocal multiplication, so plain division is optimal.
static inline uint64_t div5(const uint64_t x) {
  const uint64_t quotient = x / 5;
  return quotient;
}
135
// Computes x / 10 (compiler lowers this to a reciprocal multiplication).
static inline uint64_t div10(const uint64_t x) {
  const uint64_t quotient = x / 10;
  return quotient;
}
139
// Computes x / 100 (compiler lowers this to a reciprocal multiplication).
static inline uint64_t div100(const uint64_t x) {
  const uint64_t quotient = x / 100;
  return quotient;
}
143
// Computes x / 10^8 (compiler lowers this to a reciprocal multiplication).
static inline uint64_t div1e8(const uint64_t x) {
  const uint64_t quotient = x / 100000000;
  return quotient;
}
147
// Computes x / 10^9 (compiler lowers this to a reciprocal multiplication).
static inline uint64_t div1e9(const uint64_t x) {
  const uint64_t quotient = x / 1000000000;
  return quotient;
}
151
// Computes x % 10^9. For unsigned operands the modulo operator is exactly
// x - 10^9 * (x / 10^9), so the former div1e9-based expression is inlined
// into a single remainder; the result always fits in 32 bits.
static inline uint32_t mod1e9(const uint64_t x) {
  return (uint32_t) (x % 1000000000);
}
155
156 #endif // defined(RYU_32_BIT_PLATFORM)
157
// Returns the number of times 5 divides value (its 5-adic valuation); see
// multipleOfPowerOf5 below for the contract it supports. Defined in a
// separate translation unit.
uint32_t __pow5Factor(uint64_t value);
#define pow5Factor(v) __pow5Factor(v)
160
// Returns true if value is divisible by 5^p.
static inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) {
  // pow5Factor(value) counts how many times 5 divides value, so comparing it
  // against p answers the divisibility question directly.
  // I tried a case distinction on p, but there was no performance difference.
  return pow5Factor(value) >= p;
}
166
// Returns true if value is divisible by 2^p.
static inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) {
  assert(value != 0);
  assert(p < 64); // a shift by >= 64 would be undefined behavior
  // Divisible by 2^p iff the p lowest bits are all zero.
  // __builtin_ctzll doesn't appear to be faster here.
  const uint64_t lowBits = (1ull << p) - 1;
  return (value & lowBits) == 0;
}
174
175 // We need a 64x128-bit multiplication and a subsequent 128-bit shift.
176 // Multiplication:
177 // The 64-bit factor is variable and passed in, the 128-bit factor comes
178 // from a lookup table. We know that the 64-bit factor only has 55
179 // significant bits (i.e., the 9 topmost bits are zeros). The 128-bit
180 // factor only has 124 significant bits (i.e., the 4 topmost bits are
181 // zeros).
182 // Shift:
183 // In principle, the multiplication result requires 55 + 124 = 179 bits to
184 // represent. However, we then shift this value to the right by j, which is
185 // at least j >= 115, so the result is guaranteed to fit into 179 - 115 = 64
186 // bits. This means that we only need the topmost 64 significant bits of
187 // the 64x128-bit multiplication.
188 //
189 // There are several ways to do this:
190 // 1. Best case: the compiler exposes a 128-bit type.
191 // We perform two 64x64-bit multiplications, add the higher 64 bits of the
192 // lower result to the higher result, and shift by j - 64 bits.
193 //
194 // We explicitly cast from 64-bit to 128-bit, so the compiler can tell
195 // that these are only 64-bit inputs, and can map these to the best
196 // possible sequence of assembly instructions.
197 // x64 machines happen to have matching assembly instructions for
198 // 64x64-bit multiplications and 128-bit shifts.
199 //
200 // 2. Second best case: the compiler exposes intrinsics for the x64 assembly
201 // instructions mentioned in 1.
202 //
203 // 3. We only have 64x64 bit instructions that return the lower 64 bits of
204 // the result, i.e., we have to use plain C.
205 // Our inputs are less than the full width, so we have three options:
206 // a. Ignore this fact and just implement the intrinsics manually.
207 // b. Split both into 31-bit pieces, which guarantees no internal overflow,
208 // but requires extra work upfront (unless we change the lookup table).
209 // c. Split only the first factor into 31-bit pieces, which also guarantees
210 // no internal overflow, but requires extra work since the intermediate
211 // results are not perfectly aligned.
212 #if defined(HAS_UINT128)
213
214 // Best case: use 128-bit type.
mulShift64(const uint64_t m,const uint64_t * const mul,const int32_t j)215 static inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
216 const uint128_t b0 = ((uint128_t) m) * mul[0];
217 const uint128_t b2 = ((uint128_t) m) * mul[1];
218 return (uint64_t) (((b0 >> 64) + b2) >> (j - 64));
219 }
220
// Computes the three shifted products Ryu needs in one call: returns
// mulShift64(4m, mul, j) and stores the results for the upper neighbor
// (4m + 2) in *vp and the lower neighbor (4m - 1 - mmShift) in *vm.
static inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j,
    uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
  const uint64_t m4 = 4 * m;
  *vp = mulShift64(m4 + 2, mul, j);
  *vm = mulShift64(m4 - 1 - mmShift, mul, j);
  return mulShift64(m4, mul, j);
}
239
240 #elif defined(HAS_64_BIT_INTRINSICS)
241
// Computes (m * (mul[1]:mul[0])) >> j using two 64x64->128-bit intrinsic
// multiplications, where mul is a 128-bit factor stored as two 64-bit halves
// (mul[0] low, mul[1] high). Per the analysis above, j >= 64 and the result
// fits in 64 bits. The end-of-line numbers mark the bit position of each
// partial product within the full 192-bit result.
static inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
  // m is maximum 55 bits
  uint64_t high1; // 128
  const uint64_t low1 = umul128(m, mul[1], &high1); // 64
  uint64_t high0; // 64
  // The low 64 bits of m * mul[0] are shifted out entirely (j >= 64), so only
  // the high half (high0) contributes.
  umul128(m, mul[0], &high0); // 0
  const uint64_t sum = high0 + low1;
  if (sum < high0) {
    ++high1; // overflow into high1
  }
  return shiftright128(sum, high1, j - 64);
}
254
// Computes the three shifted products Ryu needs in one call: returns
// mulShift64(4m, mul, j) and stores the results for the upper neighbor
// (4m + 2) in *vp and the lower neighbor (4m - 1 - mmShift) in *vm.
static inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j,
    uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
  const uint64_t m4 = 4 * m;
  *vm = mulShift64(m4 - 1 - mmShift, mul, j);
  *vp = mulShift64(m4 + 2, mul, j);
  return mulShift64(m4, mul, j);
}
261
262 #else // !defined(HAS_UINT128) && !defined(HAS_64_BIT_INTRINSICS)
263
// Computes (m * (mul[1]:mul[0])) >> j in plain C, where mul is a 128-bit
// factor stored as two 64-bit halves (mul[0] low, mul[1] high). Per the
// analysis above, j >= 64 and the result fits in 64 bits.
static inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
  // m has at most 55 significant bits.
  uint64_t topHi;                                    // bits 128..191 of the product
  const uint64_t topLo = umul128(m, mul[1], &topHi); // bits 64..127
  uint64_t botHi;                                    // bits 64..127
  umul128(m, mul[0], &botHi);                        // bits 0..63 are shifted out (j >= 64)
  uint64_t mid = botHi + topLo;
  topHi += (mid < botHi); // propagate the carry out of the middle limb
  return shiftright128(mid, topHi, j - 64);
}
276
// This is faster if we don't have a 64x64->128-bit multiplication.
// Computes the same three results as the other mulShiftAll64 variants -- the
// shifted products for 4m (returned), 4m + 2 (*vp) and 4m - 1 - mmShift
// (*vm) -- from a single 64x128-bit product of 2m and mul. The neighbors are
// derived by adding/subtracting the 128-bit factor, and operands that are a
// factor of 2 smaller are compensated by shifting one bit less (j - 64 - 1
// instead of j - 64).
static inline uint64_t mulShiftAll64(uint64_t m, const uint64_t* const mul, const int32_t j,
    uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
  m <<= 1;
  // m is maximum 55 bits
  uint64_t tmp;
  const uint64_t lo = umul128(m, mul[0], &tmp);
  uint64_t hi;
  // (hi:mid:lo) now holds the 192-bit product (2m) * mul.
  const uint64_t mid = tmp + umul128(m, mul[1], &hi);
  hi += mid < tmp; // overflow into hi

  // (2m + 1) * mul, shifted by one bit less, equals (4m + 2) * mul >> (j - 64).
  const uint64_t lo2 = lo + mul[0];
  const uint64_t mid2 = mid + mul[1] + (lo2 < lo);
  const uint64_t hi2 = hi + (mid2 < mid);
  *vp = shiftright128(mid2, hi2, (uint32_t) (j - 64 - 1));

  if (mmShift == 1) {
    // (2m - 1) * mul == (4m - 2) * mul / 2, i.e. 4m - 1 - mmShift for mmShift == 1.
    const uint64_t lo3 = lo - mul[0];
    const uint64_t mid3 = mid - mul[1] - (lo3 > lo);
    const uint64_t hi3 = hi - (mid3 > mid);
    *vm = shiftright128(mid3, hi3, (uint32_t) (j - 64 - 1));
  } else {
    // (4m - 1) * mul, computed as 2 * ((2m) * mul) - mul; no shift compensation.
    const uint64_t lo3 = lo + lo;
    const uint64_t mid3 = mid + mid + (lo3 < lo);
    const uint64_t hi3 = hi + hi + (mid3 < mid);
    const uint64_t lo4 = lo3 - mul[0];
    const uint64_t mid4 = mid3 - mul[1] - (lo4 > lo3);
    const uint64_t hi4 = hi3 - (mid4 > mid3);
    *vm = shiftright128(mid4, hi4, (uint32_t) (j - 64));
  }

  // (2m) * mul >> (j - 64 - 1) == (4m) * mul >> (j - 64).
  return shiftright128(mid, hi, (uint32_t) (j - 64 - 1));
}
310
311 #endif // HAS_64_BIT_INTRINSICS
312
313 #endif // RYU_D2S_INTRINSICS_H
314