// Copyright 2018 Ulf Adams
//
// The contents of this file may be used under the terms of the Apache License,
// Version 2.0.
//
//    (See accompanying file LICENSE-Apache or copy at
//     http://www.apache.org/licenses/LICENSE-2.0)
//
// Alternatively, the contents of this file may be used under the terms of
// the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE-Boost or copy at
//     https://www.boost.org/LICENSE_1_0.txt)
//
// Unless required by applicable law or agreed to in writing, this software
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.
#ifndef RYU_D2S_INTRINSICS_H
#define RYU_D2S_INTRINSICS_H

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

// Defines RYU_32_BIT_PLATFORM if applicable.
#include "common.h"

// ABSL avoids uint128_t on Win32 even if __SIZEOF_INT128__ is defined.
// Let's do the same for now.
#if defined(__SIZEOF_INT128__) && !defined(_MSC_VER) && !defined(RYU_ONLY_64_BIT_OPS)
#define HAS_UINT128
#elif defined(_MSC_VER) && !defined(RYU_ONLY_64_BIT_OPS) && defined(_M_X64)
#define HAS_64_BIT_INTRINSICS
#endif
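
// In practice: 64-bit GCC/Clang targets take the __uint128_t path below,
// 64-bit MSVC (x64) takes the _umul128/__shiftright128 intrinsics path, and
// everything else (32-bit targets, or builds defining RYU_ONLY_64_BIT_OPS)
// uses the portable fallbacks declared further down.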

#if defined(HAS_UINT128)
typedef __uint128_t uint128_t;
#endif

#if defined(HAS_64_BIT_INTRINSICS)

#include <intrin.h>

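// MSVC's _umul128 intrinsic returns the low 64 bits of the full 128-bit
// product and stores the high 64 bits through its third argument.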
static inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
  return _umul128(a, b, productHi);
}

static inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
  // For the __shiftright128 intrinsic, the shift value is always
  // modulo 64.
  // In the current implementation of the double-precision version
  // of Ryu, the shift value is always < 64. (In the case
  // RYU_OPTIMIZE_SIZE == 0, the shift value is in the range [49, 58].
  // Otherwise in the range [2, 59].)
  // However, this function is now also called by s2d, which requires supporting
  // the larger shift range (TODO: what is the actual range?).
  // We assert this here in case a future change requires larger shift values;
  // in that case, this function will need to be adjusted.
  assert(dist < 64);
  return __shiftright128(lo, hi, (unsigned char) dist);
}

#else // defined(HAS_64_BIT_INTRINSICS)

uint64_t __umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi);
#define umul128(a,b,hi) __umul128(a,b,hi)

uint64_t __shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist);
#define shiftright128(lo,hi,dist) __shiftright128(lo,hi,dist)
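
// __umul128 and __shiftright128 are only declared here; their definitions are
// expected to be provided elsewhere. A portable definition could look roughly
// like the following sketch, which splits the operands into 32-bit halves.
// This is an illustrative sketch only, not necessarily the definition that is
// actually linked in:
//
//   uint64_t __umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
//     const uint64_t aLo = a & 0xFFFFFFFFu, aHi = a >> 32;
//     const uint64_t bLo = b & 0xFFFFFFFFu, bHi = b >> 32;
//     const uint64_t p00 = aLo * bLo;  // contributes at bit offset 0
//     const uint64_t p01 = aLo * bHi;  // contributes at bit offset 32
//     const uint64_t p10 = aHi * bLo;  // contributes at bit offset 32
//     const uint64_t p11 = aHi * bHi;  // contributes at bit offset 64
//     // Sum the middle partial products; this cannot overflow 64 bits.
//     const uint64_t mid = p10 + (p00 >> 32) + (p01 & 0xFFFFFFFFu);
//     *productHi = p11 + (p01 >> 32) + (mid >> 32);
//     return (mid << 32) | (p00 & 0xFFFFFFFFu);
//   }
//
//   uint64_t __shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
//     // Assumes callers never pass dist == 0 or dist >= 64, so both shifts
//     // below are well-defined.
//     assert(dist > 0 && dist < 64);
//     return (hi << (64 - dist)) | (lo >> dist);
//   }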

#endif // defined(HAS_64_BIT_INTRINSICS)

#if defined(RYU_32_BIT_PLATFORM)

// Returns the high 64 bits of the 128-bit product of a and b.
static inline uint64_t umulh(const uint64_t a, const uint64_t b) {
  // Reuse the umul128 implementation.
  // Optimizers will likely eliminate the instructions used to compute the
  // low part of the product.
  uint64_t hi;
  umul128(a, b, &hi);
  return hi;
}

// On 32-bit platforms, compilers typically generate calls to library
// functions for 64-bit divisions, even if the divisor is a constant.
//
// E.g.:
// https://bugs.llvm.org/show_bug.cgi?id=37932
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=17958
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=37443
//
// The functions below implement division by a constant via multiplication by
// a precomputed reciprocal, the same way compilers targeting 64-bit platforms
// do it.
//
// NB:
// The multipliers and shift values are the ones generated by clang x64
// for expressions like x/5, x/10, etc.
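//
// For example, 0xCCCCCCCCCCCCCCCD == ceil(2^66 / 5), so for every uint64_t x
//   umulh(x, 0xCCCCCCCCCCCCCCCD) >> 2 == (x * ceil(2^66 / 5)) >> 66 == x / 5,
// and shifting by 3 instead of 2 divides by an extra factor of 2, giving x / 10.
// Worked example: 7 * 0xCCCCCCCCCCCCCCCD == 0x5_999999999999999B, so the high
// 64 bits are 5, and 5 >> 2 == 1 == 7 / 5. div100 and div1e9 shift x right
// first and then divide by the remaining odd factor (25 and 5^9) the same way.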

static inline uint64_t div5(const uint64_t x) {
  return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 2;
}

static inline uint64_t div10(const uint64_t x) {
  return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 3;
}

static inline uint64_t div100(const uint64_t x) {
  return umulh(x >> 2, 0x28F5C28F5C28F5C3u) >> 2;
}

static inline uint64_t div1e8(const uint64_t x) {
  return umulh(x, 0xABCC77118461CEFDu) >> 26;
}

static inline uint64_t div1e9(const uint64_t x) {
  return umulh(x >> 9, 0x44B82FA09B5A53u) >> 11;
}

static inline uint32_t mod1e9(const uint64_t x) {
  // Avoid 64-bit math as much as possible.
  // Returning (uint32_t) (x - 1000000000 * div1e9(x)) would
  // perform 32x64-bit multiplication and 64-bit subtraction.
  // x and 1000000000 * div1e9(x) are guaranteed to differ by
  // less than 10^9, so their highest 32 bits must be identical,
  // so we can truncate both sides to uint32_t before subtracting.
  // We can also simplify (uint32_t) (1000000000 * div1e9(x)).
  // We can truncate before multiplying instead of after, as multiplying
  // the highest 32 bits of div1e9(x) can't affect the lowest 32 bits.
  return ((uint32_t) x) - 1000000000 * ((uint32_t) div1e9(x));
}

#else // defined(RYU_32_BIT_PLATFORM)

static inline uint64_t div5(const uint64_t x) {
  return x / 5;
}

static inline uint64_t div10(const uint64_t x) {
  return x / 10;
}

static inline uint64_t div100(const uint64_t x) {
  return x / 100;
}

static inline uint64_t div1e8(const uint64_t x) {
  return x / 100000000;
}

static inline uint64_t div1e9(const uint64_t x) {
  return x / 1000000000;
}

static inline uint32_t mod1e9(const uint64_t x) {
  return (uint32_t) (x - 1000000000 * div1e9(x));
}

#endif // defined(RYU_32_BIT_PLATFORM)

uint32_t __pow5Factor(uint64_t value);
#define pow5Factor(v) __pow5Factor(v)
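
// __pow5Factor(value) returns the exponent of the largest power of 5 that
// divides the (non-zero) value; it is only declared here, with the definition
// provided elsewhere. A straightforward definition (an illustrative sketch,
// not necessarily the one actually used) repeatedly divides by 5 using div5
// from above:
//
//   uint32_t __pow5Factor(uint64_t value) {
//     uint32_t count = 0;
//     for (;;) {
//       assert(value != 0);
//       const uint64_t q = div5(value);
//       // value is divisible by 5 exactly when value == 5 * q.
//       if (value != 5 * q) break;
//       value = q;
//       ++count;
//     }
//     return count;
//   }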

// Returns true if value is divisible by 5^p.
static inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) {
  // I tried a case distinction on p, but there was no performance difference.
  return pow5Factor(value) >= p;
}

// Returns true if value is divisible by 2^p.
static inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) {
  assert(value != 0);
  assert(p < 64);
  // __builtin_ctzll doesn't appear to be faster here.
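  // value is divisible by 2^p exactly when its p lowest bits are all zero,
  // i.e. when masking with (2^p - 1) yields 0.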
  return (value & ((1ull << p) - 1)) == 0;
}

// We need a 64x128-bit multiplication and a subsequent 128-bit shift.
// Multiplication:
//   The 64-bit factor is variable and passed in, the 128-bit factor comes
//   from a lookup table. We know that the 64-bit factor only has 55
//   significant bits (i.e., the 9 topmost bits are zeros). The 128-bit
//   factor only has 124 significant bits (i.e., the 4 topmost bits are
//   zeros).
// Shift:
//   In principle, the multiplication result requires 55 + 124 = 179 bits to
//   represent. However, we then shift this value to the right by j, which is
//   at least 115, so the result is guaranteed to fit into 179 - 115 = 64
//   bits. This means that we only need the topmost 64 significant bits of
//   the 64x128-bit multiplication.
//
// There are several ways to do this:
// 1. Best case: the compiler exposes a 128-bit type.
//    We perform two 64x64-bit multiplications, add the higher 64 bits of the
//    lower result to the higher result, and shift by j - 64 bits.
//
//    We explicitly cast from 64-bit to 128-bit, so the compiler can tell
//    that these are only 64-bit inputs, and can map these to the best
//    possible sequence of assembly instructions.
//    x64 machines happen to have matching assembly instructions for
//    64x64-bit multiplications and 128-bit shifts.
//
// 2. Second best case: the compiler exposes intrinsics for the x64 assembly
//    instructions mentioned in 1.
//
// 3. We only have 64x64-bit multiplication operations that return the lower
//    64 bits of the result, i.e., we have to use plain C.
//    Our inputs are less than the full width, so we have three options:
//    a. Ignore this fact and just implement the intrinsics manually.
//    b. Split both into 31-bit pieces, which guarantees no internal overflow,
//       but requires extra work upfront (unless we change the lookup table).
//    c. Split only the first factor into 31-bit pieces, which also guarantees
//       no internal overflow, but requires extra work since the intermediate
//       results are not perfectly aligned.
#if defined(HAS_UINT128)

// Best case: use 128-bit type.
static inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
  const uint128_t b0 = ((uint128_t) m) * mul[0];
  const uint128_t b2 = ((uint128_t) m) * mul[1];
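  // Dropping the low 64 bits of b0 before the final shift is exact:
  // floor(floor(x / 2^64) / 2^(j-64)) == floor(x / 2^j), so the result below
  // equals (m * (mul[1]*2^64 + mul[0])) >> j.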
  return (uint64_t) (((b0 >> 64) + b2) >> (j - 64));
}

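// Computes mulShift64 for 4*m, 4*m + 2, and 4*m - 1 - mmShift in one call.
// In Ryu these are the scaled significand and the upper/lower boundaries of
// its rounding interval. The commented-out block below is an alternative
// formulation that derives all three results from a single 128-bit product.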
static inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j,
  uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
//  m <<= 2;
//  uint128_t b0 = ((uint128_t) m) * mul[0]; // 0
//  uint128_t b2 = ((uint128_t) m) * mul[1]; // 64
//
//  uint128_t hi = (b0 >> 64) + b2;
//  uint128_t lo = b0 & 0xffffffffffffffffull;
//  uint128_t factor = (((uint128_t) mul[1]) << 64) + mul[0];
//  uint128_t vpLo = lo + (factor << 1);
//  *vp = (uint64_t) ((hi + (vpLo >> 64)) >> (j - 64));
//  uint128_t vmLo = lo - (factor << mmShift);
//  *vm = (uint64_t) ((hi + (vmLo >> 64) - (((uint128_t) 1ull) << 64)) >> (j - 64));
//  return (uint64_t) (hi >> (j - 64));
  *vp = mulShift64(4 * m + 2, mul, j);
  *vm = mulShift64(4 * m - 1 - mmShift, mul, j);
  return mulShift64(4 * m, mul, j);
}

#elif defined(HAS_64_BIT_INTRINSICS)

static inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
  // m is maximum 55 bits
  uint64_t high1;                                   // 128
  const uint64_t low1 = umul128(m, mul[1], &high1); // 64
  uint64_t high0;                                   // 64
  umul128(m, mul[0], &high0);                       // 0
  const uint64_t sum = high0 + low1;
  if (sum < high0) {
    ++high1; // overflow into high1
  }
  return shiftright128(sum, high1, j - 64);
}

static inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j,
  uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
  *vp = mulShift64(4 * m + 2, mul, j);
  *vm = mulShift64(4 * m - 1 - mmShift, mul, j);
  return mulShift64(4 * m, mul, j);
}

#else // !defined(HAS_UINT128) && !defined(HAS_64_BIT_INTRINSICS)

static inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
  // m is maximum 55 bits
  uint64_t high1;                                   // 128
  const uint64_t low1 = umul128(m, mul[1], &high1); // 64
  uint64_t high0;                                   // 64
  umul128(m, mul[0], &high0);                       // 0
  const uint64_t sum = high0 + low1;
  if (sum < high0) {
    ++high1; // overflow into high1
  }
  return shiftright128(sum, high1, j - 64);
}

// This is faster if we don't have a 64x64->128-bit multiplication.
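// Instead of evaluating mulShift64 three times for 4*m, 4*m + 2, and
// 4*m - 1 - mmShift, this version works with 2*m, folds the remaining factor
// of 2 into a one-bit-smaller shift (j - 64 - 1), and derives vp and vm from
// the shared 192-bit product by adding or subtracting the 128-bit multiplier.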
static inline uint64_t mulShiftAll64(uint64_t m, const uint64_t* const mul, const int32_t j,
  uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
  m <<= 1;
  // m is at most 55 bits on entry, so at most 56 bits after the shift
  uint64_t tmp;
  const uint64_t lo = umul128(m, mul[0], &tmp);
  uint64_t hi;
  const uint64_t mid = tmp + umul128(m, mul[1], &hi);
  hi += mid < tmp; // overflow into hi

  const uint64_t lo2 = lo + mul[0];
  const uint64_t mid2 = mid + mul[1] + (lo2 < lo);
  const uint64_t hi2 = hi + (mid2 < mid);
  *vp = shiftright128(mid2, hi2, (uint32_t) (j - 64 - 1));

  if (mmShift == 1) {
    const uint64_t lo3 = lo - mul[0];
    const uint64_t mid3 = mid - mul[1] - (lo3 > lo);
    const uint64_t hi3 = hi - (mid3 > mid);
    *vm = shiftright128(mid3, hi3, (uint32_t) (j - 64 - 1));
  } else {
    const uint64_t lo3 = lo + lo;
    const uint64_t mid3 = mid + mid + (lo3 < lo);
    const uint64_t hi3 = hi + hi + (mid3 < mid);
    const uint64_t lo4 = lo3 - mul[0];
    const uint64_t mid4 = mid3 - mul[1] - (lo4 > lo3);
    const uint64_t hi4 = hi3 - (mid4 > mid3);
    *vm = shiftright128(mid4, hi4, (uint32_t) (j - 64));
  }

  return shiftright128(mid, hi, (uint32_t) (j - 64 - 1));
}

#endif // HAS_64_BIT_INTRINSICS

#endif // RYU_D2S_INTRINSICS_H