/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15 
16 #include "tensorflow/lite/kernels/internal/quantization_util.h"
17 
18 #include <algorithm>
19 #include <cmath>
20 #include <limits>
21 
22 #include "tensorflow/lite/kernels/internal/compatibility.h"
23 #include "tensorflow/lite/kernels/internal/cppmath.h"
24 
25 namespace tflite {
26 
27 namespace {
28 // These constants are used to manipulate the binary representation of doubles.
29 // Double-precision binary64 floating point format is:
30 // Bit |  63  |  62-52   |   51-0   |
31 //     | Sign | Exponent | Fraction |
32 // To avoid 64-bit integers as much as possible, I break this into high and
33 // low 32-bit chunks. High is:
34 // Bit |  31  |  30-20   |      19-0     |
35 //     | Sign | Exponent | High Fraction |
36 // Low is:
37 // Bit |     31-0     |
38 //     | Low Fraction |
39 // We then access the components through logical bit-wise operations to
40 // extract the parts needed, with the positions and masks derived from the
41 // layout shown above.
42 constexpr uint64_t kSignMask = 0x8000000000000000LL;
43 constexpr uint64_t kExponentMask = 0x7ff0000000000000LL;
44 constexpr int32_t kExponentShift = 52;
45 constexpr int32_t kExponentBias = 1023;
46 constexpr uint32_t kExponentIsBadNum = 0x7ff;
47 constexpr uint64_t kFractionMask = 0x000fffffffc00000LL;
48 constexpr uint32_t kFractionShift = 22;
49 constexpr uint32_t kFractionRoundingMask = 0x003fffff;
50 constexpr uint32_t kFractionRoundingThreshold = 0x00200000;
51 }  // namespace
52 
QuantizeMultiplier(double double_multiplier,int32_t * quantized_multiplier,int * shift)53 void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
54                         int* shift) {
55   if (double_multiplier == 0.) {
56     *quantized_multiplier = 0;
57     *shift = 0;
58     return;
59   }
60 #ifdef TFLITE_EMULATE_FLOAT
61   // If we're trying to avoid the use of floating-point instructions (for
62   // example on microcontrollers) then use an alternative implementation
63   // that only requires integer and bitwise operations. To enable this, you
64   // need to set the define during the build process for your platform.
65   int64_t q_fixed = IntegerFrExp(double_multiplier, shift);
66 #else   // TFLITE_EMULATE_FLOAT
67   const double q = std::frexp(double_multiplier, shift);
68   auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
69 #endif  // TFLITE_EMULATE_FLOAT
70   TFLITE_CHECK(q_fixed <= (1ll << 31));
71   if (q_fixed == (1ll << 31)) {
72     q_fixed /= 2;
73     ++*shift;
74   }
75   TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
76   // A shift amount smaller than -31 would cause all bits to be shifted out
77   // and thus all results would be zero. We implement that instead with
78   // q_fixed==0, so as to avoid hitting issues with right-shift
79   // operations with shift amounts greater than 31. Note that this happens
80   // roughly when abs(double_multiplier) < 2^-31 and the present handling means
81   // that we're effectively flushing tiny double_multiplier's to zero.
82   // We could conceivably handle values in the range (roughly) [32, 63]
83   // as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view
84   // the present handling is just doing 'flush denormals to zero'. We could
85   // reconsider and actually generate nonzero denormals if a need arises.
86   if (*shift < -31) {
87     *shift = 0;
88     q_fixed = 0;
89   }
90   *quantized_multiplier = static_cast<int32_t>(q_fixed);
91 }
92 
QuantizeMultiplierGreaterThanOne(double double_multiplier,int32_t * quantized_multiplier,int * left_shift)93 void QuantizeMultiplierGreaterThanOne(double double_multiplier,
94                                       int32_t* quantized_multiplier,
95                                       int* left_shift) {
96   TFLITE_CHECK_GT(double_multiplier, 1.);
97   QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift);
98   TFLITE_CHECK_GE(*left_shift, 0);
99 }
100 
QuantizeMultiplierSmallerThanOneExp(double double_multiplier,int32_t * quantized_multiplier,int * left_shift)101 void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
102                                          int32_t* quantized_multiplier,
103                                          int* left_shift) {
104   TFLITE_CHECK_LT(double_multiplier, 1.);
105   TFLITE_CHECK_GT(double_multiplier, 0.);
106   int shift;
107   QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
108   TFLITE_CHECK_LE(shift, 0);
109   *left_shift = shift;
110 }
111 
IntegerFrExp(double input,int * shift)112 int64_t IntegerFrExp(double input, int* shift) {
113   // Make sure our assumptions about the double layout hold.
114   TFLITE_CHECK_EQ(8, sizeof(double));
115 
116   // We want to access the bits of the input double value directly, which is
117   // tricky to do safely, so use a union to handle the casting.
118   union {
119     double double_value;
120     uint64_t double_as_uint;
121   } cast_union;
122   cast_union.double_value = input;
123   const uint64_t u = cast_union.double_as_uint;
124 
125   // If the bitfield is all zeros apart from the sign bit, this is a normalized
126   // zero value, so return standard values for this special case.
127   if ((u & ~kSignMask) == 0) {
128     *shift = 0;
129     return 0;
130   }
131 
132   // Deal with NaNs and Infs, which are always indicated with a fixed pattern in
133   // the exponent, and distinguished by whether the fractions are zero or
134   // non-zero.
135   const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift);
136   if (exponent_part == kExponentIsBadNum) {
137     *shift = std::numeric_limits<int>::max();
138     if (u & kFractionMask) {
139       // NaN, so just return zero (with the exponent set to INT_MAX).
140       return 0;
141     } else {
142       // Infinity, so return +/- INT_MAX.
143       if (u & kSignMask) {
144         return std::numeric_limits<int64_t>::min();
145       } else {
146         return std::numeric_limits<int64_t>::max();
147       }
148     }
149   }
150 
151   // The shift is fairly easy to extract from the high bits of the double value,
152   // just by masking it out and applying a bias. The std::frexp() implementation
153   // always returns values between 0.5 and 1.0 though, whereas the exponent
154   // assumes 1.0 to 2.0 is the standard range, so I add on one to match that
155   // interface.
156   *shift = (exponent_part - kExponentBias) + 1;
157 
158   // There's an implicit high bit in the double format definition, so make sure
159   // we include that at the top, and then reconstruct the rest of the fractional
160   // value from the remaining fragments.
161   int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift);
162 
163   // We're cutting off some bits at the bottom, so to exactly match the standard
164   // frexp implementation here we'll apply rounding by adding one to the least
165   // significant bit of the result if the discarded portion is over half of the
166   // maximum.
167   if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) {
168     fraction += 1;
169   }
170   // Negate the fraction if the sign bit was set.
171   if (u & kSignMask) {
172     fraction *= -1;
173   }
174 
175   return fraction;
176 }
177 
DoubleFromFractionAndShift(int64_t fraction,int shift)178 double DoubleFromFractionAndShift(int64_t fraction, int shift) {
179   union {
180     double double_value;
181     uint64_t double_as_uint;
182   } result;
183 
184   // Detect NaNs and infinities.
185   if (shift == std::numeric_limits<int>::max()) {
186     if (fraction == 0) {
187       return std::numeric_limits<double>::quiet_NaN();
188     } else if (fraction > 0) {
189       return std::numeric_limits<double>::infinity();
190     } else {
191       return -std::numeric_limits<double>::infinity();
192     }
193   }
194 
195   // Return a normalized zero for a zero fraction.
196   if (fraction == 0) {
197     result.double_as_uint = 0;
198     return result.double_value;
199   }
200 
201   bool is_negative = (fraction < 0);
202   int64_t encoded_fraction = is_negative ? -fraction : fraction;
203   int64_t encoded_shift = (shift - 1);
204   while (encoded_fraction < 0x40000000) {
205     encoded_fraction *= 2;
206     encoded_shift -= 1;
207   }
208   while (encoded_fraction > 0x80000000) {
209     encoded_fraction /= 2;
210     encoded_shift += 1;
211   }
212   encoded_fraction -= 0x40000000;
213   if (encoded_shift < -1022) {
214     encoded_shift = -1023;
215   } else if (encoded_shift > 1022) {
216     encoded_shift = 1023;
217   }
218   encoded_shift += kExponentBias;
219   uint64_t encoded_sign = is_negative ? kSignMask : 0;
220   result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) |
221                           (encoded_fraction << kFractionShift);
222   return result.double_value;
223 }
224 
IntegerDoubleMultiply(double a,double b)225 double IntegerDoubleMultiply(double a, double b) {
226   int a_shift;
227   const int64_t a_fraction = IntegerFrExp(a, &a_shift);
228   int b_shift;
229   const int64_t b_fraction = IntegerFrExp(b, &b_shift);
230   // Detect NaNs and infinities.
231   if (a_shift == std::numeric_limits<int>::max() ||
232       (b_shift == std::numeric_limits<int>::max())) {
233     return std::numeric_limits<double>::quiet_NaN();
234   }
235   const int result_shift = a_shift + b_shift + 1;
236   const int64_t result_fraction = (a_fraction * b_fraction) >> 32;
237   return DoubleFromFractionAndShift(result_fraction, result_shift);
238 }
239 
IntegerDoubleCompare(double a,double b)240 int IntegerDoubleCompare(double a, double b) {
241   int a_shift;
242   const int64_t a_fraction = IntegerFrExp(a, &a_shift);
243   int b_shift;
244   const int64_t b_fraction = IntegerFrExp(b, &b_shift);
245 
246   // Detect NaNs and infinities.
247   if (a_shift == std::numeric_limits<int>::max() ||
248       (b_shift == std::numeric_limits<int>::max())) {
249     return 1;
250   }
251 
252   if ((a_fraction == 0) && (b_fraction < 0)) {
253     return 1;
254   } else if ((a_fraction < 0) && (b_fraction == 0)) {
255     return -1;
256   } else if (a_shift < b_shift) {
257     return -1;
258   } else if (a_shift > b_shift) {
259     return 1;
260   } else if (a_fraction < b_fraction) {
261     return -1;
262   } else if (a_fraction > b_fraction) {
263     return 1;
264   } else {
265     return 0;
266   }
267 }
268 
PreprocessSoftmaxScaling(double beta,double input_scale,int input_integer_bits,int32_t * quantized_multiplier,int * left_shift)269 void PreprocessSoftmaxScaling(double beta, double input_scale,
270                               int input_integer_bits,
271                               int32_t* quantized_multiplier, int* left_shift) {
272   // If the overall multiplier (input and beta) is large, then exp() of an
273   // input difference of 1 scaled by this will be large.  In other words, we
274   // can cap the multiplier and know that, when it is used, the output will be
275   // (round to) zero wherever the input is not at the maximum value.
276 
277   // If the overall scale is less than one, and input_integer_bits=0, then the
278   // result is double equivalent of Q0.31 (actually with more precision). Thus
279   // this generates a Q(input_integer_bits).(31-input_integer_bits)
280   // representation.
281 #ifdef TFLITE_EMULATE_FLOAT
282   const double input_beta = IntegerDoubleMultiply(beta, input_scale);
283   int shift;
284   int64_t fraction = IntegerFrExp(input_beta, &shift);
285   shift += (31 - input_integer_bits);
286   double input_beta_real_multiplier =
287       DoubleFromFractionAndShift(fraction, shift);
288   if (IntegerDoubleCompare(input_beta_real_multiplier, (1ll << 31) - 1.0) > 0) {
289     input_beta_real_multiplier = (1ll << 31) - 1.0;
290   }
291 #else   // TFLITE_EMULATE_FLOAT
292   const double input_beta_real_multiplier = std::min<double>(
293       beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0);
294 #endif  // TFLITE_EMULATE_FLOAT
295 
296   QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
297                                    quantized_multiplier, left_shift);
298 }
299 
PreprocessLogSoftmaxScalingExp(double beta,double input_scale,int input_integer_bits,int32_t * quantized_multiplier,int * left_shift,int32_t * reverse_scaling_divisor,int * reverse_scaling_left_shift)300 void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
301                                     int input_integer_bits,
302                                     int32_t* quantized_multiplier,
303                                     int* left_shift,
304                                     int32_t* reverse_scaling_divisor,
305                                     int* reverse_scaling_left_shift) {
306   PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
307                            quantized_multiplier, left_shift);
308 
309   // Also calculate what amounts to the inverse scaling factor for the input.
310   const double real_reverse_scaling_divisor =
311       (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
312   tflite::QuantizeMultiplierSmallerThanOneExp(real_reverse_scaling_divisor,
313                                               reverse_scaling_divisor,
314                                               reverse_scaling_left_shift);
315 }
316 
CalculateInputRadius(int input_integer_bits,int input_left_shift,int total_signed_bits)317 int CalculateInputRadius(int input_integer_bits, int input_left_shift,
318                          int total_signed_bits) {
319 #ifdef TFLITE_EMULATE_FLOAT
320   int64_t result = (1 << input_integer_bits) - 1;
321   result <<= (total_signed_bits - input_integer_bits);
322   result >>= input_left_shift;
323   return result;
324 #else   // TFLITE_EMULATE_FLOAT
325   const double max_input_rescaled =
326       1.0 * ((1 << input_integer_bits) - 1) *
327       (1ll << (total_signed_bits - input_integer_bits)) /
328       (1ll << input_left_shift);
329   // Tighten bound using floor.  Suppose that we could use the exact value.
330   // After scaling the difference, the result would be at the maximum.  Thus we
331   // must ensure that our value has lower magnitude.
332   return static_cast<int>(std::floor(max_input_rescaled));
333 #endif  // TFLITE_EMULATE_FLOAT
334 }
335 
NudgeQuantizationRange(const float min,const float max,const int quant_min,const int quant_max,float * nudged_min,float * nudged_max,float * nudged_scale)336 void NudgeQuantizationRange(const float min, const float max,
337                             const int quant_min, const int quant_max,
338                             float* nudged_min, float* nudged_max,
339                             float* nudged_scale) {
340   // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
341   const float quant_min_float = static_cast<float>(quant_min);
342   const float quant_max_float = static_cast<float>(quant_max);
343   *nudged_scale = (max - min) / (quant_max_float - quant_min_float);
344   const float zero_point_from_min = quant_min_float - min / *nudged_scale;
345   uint16_t nudged_zero_point;
346   if (zero_point_from_min < quant_min_float) {
347     nudged_zero_point = static_cast<uint16_t>(quant_min);
348   } else if (zero_point_from_min > quant_max_float) {
349     nudged_zero_point = static_cast<uint16_t>(quant_max);
350   } else {
351     nudged_zero_point = static_cast<uint16_t>(TfLiteRound(zero_point_from_min));
352   }
353   *nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
354   *nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
355 }
356 
FakeQuantizeArray(const float nudged_scale,const float nudged_min,const float nudged_max,const float * input_data,float * output_data,const float size)357 void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
358                        const float nudged_max, const float* input_data,
359                        float* output_data, const float size) {
360   // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
361   const float inv_nudged_scale = 1.0f / nudged_scale;
362 
363   for (int i = 0; i < size; i++) {
364     const float src_val = input_data[i];
365     const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
366     const float clamped_shifted = clamped - nudged_min;
367     const float dst_val =
368         TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
369         nudged_min;
370     output_data[i] = dst_val;
371   }
372 }
373 
CheckedLog2(const float x,int * log2_result)374 bool CheckedLog2(const float x, int* log2_result) {
375   // Using TfLiteRound instead of std::round and std::log instead of
376   // std::log2 to work around these functions being missing in a toolchain
377   // used in some TensorFlow tests as of May 2018.
378   const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
379   const float x_log2_rounded = TfLiteRound(x_log2);
380   const float x_log2_fracpart = x_log2 - x_log2_rounded;
381 
382   *log2_result = static_cast<int>(x_log2_rounded);
383   return std::abs(x_log2_fracpart) < 1e-3f;
384 }
385 
QuantizeMultiplierArray(const double * effective_scales,size_t size,int32_t * effective_scale_significand,int * effective_shift)386 void QuantizeMultiplierArray(const double* effective_scales, size_t size,
387                              int32_t* effective_scale_significand,
388                              int* effective_shift) {
389   for (size_t i = 0; i < size; ++i) {
390     QuantizeMultiplier(effective_scales[i], &effective_scale_significand[i],
391                        &effective_shift[i]);
392   }
393 }
394 
395 }  // namespace tflite
396