/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_

#include <algorithm>

#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {

// Used in tests and template parameters to control which version of depthwise
// convolution is called. Primarily for reference code, and specializations
// forced in tests.
enum class DepthwiseConvImplementation {
  // Run all tests against kUseStandardEntry even if also testing another
  // kernel, since we need to be sure that the main DepthwiseConv() function in
  // optimized_ops.h dispatches to a correctly-executing kernel.
  kNone = 0,                 // The "default" option: use the normal
                             // DepthwiseConv kernel (entry) function.
  kUseGenericKernel,         // Forced use of generic kernel.
  kUseNeon3x3,               // 3x3 kernel that uses NEON when available.
  kUseNeon3x3DotProduct,     // 3x3 kernel that uses dot-product enabled NEON
                             // when available.
  kUseCModel3x3DotProduct,   // 3x3 kernel, reference C model intended to
                             // match the overall design of the NEON code.
  kUseUnwound3x3DotProduct,  // 3x3 kernel, reference C model with unwound
                             // loops and some arrays.
  kUseIntrinsics3x3DotProduct,  // 3x3 kernel using NEON intrinsics.
};

// Category of depthwise convolution output rounding.
enum class DepthwiseConvOutputRounding {
  kNone = 0,      // Invalid: specific method must be specified.
  kAwayFromZero,  // Original method: exact halves rounded away from zero.
  kUpward,        // Halves towards +infinity: adds 0.5 before truncate.
  // This is where a future kNearestEven would be placed.
};
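
// For illustration (values chosen arbitrarily, not taken from any kernel):
// dividing a scaled accumulator of -5 by 2 gives exactly -2.5.
//   kAwayFromZero: rounds to -3 (exact half moved away from zero).
//   kUpward:       rounds to -2, since floor(-2.5 + 0.5) == -2.
// Positive halves such as +2.5 round to +3 under both modes.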

// Category of depthwise convolution depth multiplication.
enum class DepthwiseConvDepthMultiplication {
  kNoMultiplication = 0,  // Depth multiplier = 1.
  kUnitInputDepth,        // Input depth = 1, output depth = depth multiplier.
};
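
// Example (shapes illustrative only): under kUnitInputDepth, an input of
// shape [batch, height, width, 1] with depth_multiplier == 8 yields a filter
// and output of depth 8; under kNoMultiplication the output depth equals the
// input depth.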

namespace reference_ops {
namespace depthwise_conv {

template <DepthwiseConvOutputRounding output_rounding>
inline int32_t DepthwiseConvRound(int32_t x, int32_t quantized_multiplier,
                                  int shift) {
  TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone);
  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
}

template <>
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
    int32_t x, int32_t quantized_multiplier, int shift) {
  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
}

template <>
inline int32_t DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
    int32_t x, int32_t quantized_multiplier, int shift) {
  using gemmlowp::SaturatingRoundingDoublingHighMul;
  const int left_shift = shift > 0 ? shift : 0;
  const int right_shift = shift > 0 ? 0 : -shift;
  const int rounding_offset = right_shift > 0 ? 1 << (right_shift - 1) : 0;
  return (SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
                                            quantized_multiplier) +
          rounding_offset) >>
         right_shift;
}
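
// A worked example of the kUpward path (the numbers are illustrative only):
// with shift == -3 we get left_shift == 0, right_shift == 3 and
// rounding_offset == 1 << 2 == 4. If the saturating doubling high multiply
// returns 21, the result is (21 + 4) >> 3 == 3, i.e. 21 / 8 = 2.625 rounded
// to the nearest integer with exact halves taken upward.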

template <DepthwiseConvOutputRounding output_rounding>
struct DepthwiseConvBasicKernel {
  static inline void Run(
      const DepthwiseParams& params, const RuntimeShape& input_shape,
      const uint8_t* input_data, const RuntimeShape& filter_shape,
      const uint8_t* filter_data, const RuntimeShape& bias_shape,
      const int32_t* bias_data, const RuntimeShape& output_shape,
      uint8_t* output_data) {
    const int stride_width = params.stride_width;
    const int stride_height = params.stride_height;
    const int dilation_width_factor = params.dilation_width_factor;
    const int dilation_height_factor = params.dilation_height_factor;
    const int pad_width = params.padding_values.width;
    const int pad_height = params.padding_values.height;
    const int depth_multiplier = params.depth_multiplier;
    const int32_t output_activation_min = params.quantized_activation_min;
    const int32_t output_activation_max = params.quantized_activation_max;
    const int32_t input_offset = params.input_offset;
    const int32_t filter_offset = params.weights_offset;
    const int32_t output_offset = params.output_offset;
    const int32_t output_multiplier = params.output_multiplier;
    const int output_shift = params.output_shift;
    TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
    TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
    TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

    TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
    const int batches = MatchingDim(input_shape, 0, output_shape, 0);
    const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
    const int input_height = input_shape.Dims(1);
    const int input_width = input_shape.Dims(2);
    const int input_depth = input_shape.Dims(3);
    const int filter_height = filter_shape.Dims(1);
    const int filter_width = filter_shape.Dims(2);
    const int output_height = output_shape.Dims(1);
    const int output_width = output_shape.Dims(2);
    TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);

    for (int b = 0; b < batches; ++b) {
      for (int out_y = 0; out_y < output_height; ++out_y) {
        for (int out_x = 0; out_x < output_width; ++out_x) {
          for (int ic = 0; ic < input_depth; ++ic) {
            for (int m = 0; m < depth_multiplier; m++) {
              const int oc = m + ic * depth_multiplier;
              const int in_x_origin = (out_x * stride_width) - pad_width;
              const int in_y_origin = (out_y * stride_height) - pad_height;
              int32_t acc = 0;
              for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
                for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
                  const int in_x =
                      in_x_origin + dilation_width_factor * filter_x;
                  const int in_y =
                      in_y_origin + dilation_height_factor * filter_y;
                  // If the location is outside the bounds of the input image,
                  // use zero as a default value.
                  if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                      (in_y < input_height)) {
                    int32_t input_val =
                        input_data[Offset(input_shape, b, in_y, in_x, ic)];
                    int32_t filter_val = filter_data[Offset(
                        filter_shape, 0, filter_y, filter_x, oc)];
                    acc += (filter_val + filter_offset) *
                           (input_val + input_offset);
                  }
                }
              }
              if (bias_data) {
                acc += bias_data[oc];
              }
              acc = DepthwiseConvRound<output_rounding>(acc, output_multiplier,
                                                        output_shift);
              acc += output_offset;
              acc = std::max(acc, output_activation_min);
              acc = std::min(acc, output_activation_max);
              output_data[Offset(output_shape, b, out_y, out_x, oc)] =
                  static_cast<uint8_t>(acc);
            }
          }
        }
      }
    }
  }

  // TODO(b/148596273): Reconcile reference versions, perhaps with common
  // MultiplyByQuantizedMultiplier or DepthwiseConvRound function.
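  // A usage note (inferred from the indexing in RunPerChannel below, not a
  // stated contract): params.output_multiplier_per_channel and
  // params.output_shift_per_channel must each point to at least output_depth
  // entries, since they are indexed by output_channel.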
  static inline void RunPerChannel(
      const DepthwiseParams& params, const RuntimeShape& input_shape,
      const int8_t* input_data, const RuntimeShape& filter_shape,
      const int8_t* filter_data, const RuntimeShape& bias_shape,
      const int32_t* bias_data, const RuntimeShape& output_shape,
      int8_t* output_data) {
    // Get parameters.
    // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
    const int stride_width = params.stride_width;
    const int stride_height = params.stride_height;
    const int dilation_width_factor = params.dilation_width_factor;
    const int dilation_height_factor = params.dilation_height_factor;
    const int pad_width = params.padding_values.width;
    const int pad_height = params.padding_values.height;
    const int depth_multiplier = params.depth_multiplier;
    const int32_t input_offset = params.input_offset;
    const int32_t output_offset = params.output_offset;
    const int32_t output_activation_min = params.quantized_activation_min;
    const int32_t output_activation_max = params.quantized_activation_max;
    const int32_t* output_multiplier = params.output_multiplier_per_channel;
    const int32_t* output_shift = params.output_shift_per_channel;

    // Check dimensions of the tensors.
    TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
    TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
    TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

    TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
    const int batches = MatchingDim(input_shape, 0, output_shape, 0);
    const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
    const int input_height = input_shape.Dims(1);
    const int input_width = input_shape.Dims(2);
    const int input_depth = input_shape.Dims(3);
    const int filter_height = filter_shape.Dims(1);
    const int filter_width = filter_shape.Dims(2);
    const int output_height = output_shape.Dims(1);
    const int output_width = output_shape.Dims(2);
    TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);

    for (int batch = 0; batch < batches; ++batch) {
      for (int out_y = 0; out_y < output_height; ++out_y) {
        for (int out_x = 0; out_x < output_width; ++out_x) {
          for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
            for (int m = 0; m < depth_multiplier; ++m) {
              const int output_channel = m + in_channel * depth_multiplier;
              const int in_x_origin = (out_x * stride_width) - pad_width;
              const int in_y_origin = (out_y * stride_height) - pad_height;
              int32_t acc = 0;
              for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
                for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
                  const int in_x =
                      in_x_origin + dilation_width_factor * filter_x;
                  const int in_y =
                      in_y_origin + dilation_height_factor * filter_y;
                  // Zero padding by omitting the areas outside the image.
                  const bool is_point_inside_image =
                      (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                      (in_y < input_height);
                  if (is_point_inside_image) {
                    int32_t input_val = input_data[Offset(
                        input_shape, batch, in_y, in_x, in_channel)];
                    int32_t filter_val = filter_data[Offset(
                        filter_shape, 0, filter_y, filter_x, output_channel)];
                    // Accumulate with a 32-bit accumulator.
                    // In the nudging process during model quantization, we
                    // force the real value 0.0 to be represented by a
                    // quantized value, which guarantees that input_offset
                    // fits in an int8_t even though it is stored as an
                    // int32_t. Each step is
                    // int32_t += int8_t * (int8_t - int8_t), so the largest
                    // value from a single product is
                    // [-127, 127] * ([-128, 127] - [-128, 127]), which is
                    // [-32512, 32512]. log2(32512) = 14.98, so we can
                    // accumulate at least 2^16 products without overflow.
                    // The accumulator runs over one filter, so this holds as
                    // long as the filter size (filter_y * filter_x *
                    // in_channel) does not exceed 2^16, which is the case in
                    // all the models we have seen so far.
                    acc += filter_val * (input_val + input_offset);
                  }
                }
              }
              if (bias_data) {
                acc += bias_data[output_channel];
              }
              acc = DepthwiseConvRound<output_rounding>(
                  acc, output_multiplier[output_channel],
                  output_shift[output_channel]);
              acc += output_offset;
              acc = std::max(acc, output_activation_min);
              acc = std::min(acc, output_activation_max);
              output_data[Offset(output_shape, batch, out_y, out_x,
                                 output_channel)] = static_cast<int8_t>(acc);
            }
          }
        }
      }
    }
  }
};

}  // namespace depthwise_conv

inline void DepthwiseConv(
    const DepthwiseParams& params, const RuntimeShape& input_shape,
    const uint8_t* input_data, const RuntimeShape& filter_shape,
    const uint8_t* filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    uint8_t* output_data) {
  return depthwise_conv::DepthwiseConvBasicKernel<
      DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape,
                                                       input_data, filter_shape,
                                                       filter_data, bias_shape,
                                                       bias_data, output_shape,
                                                       output_data);
}
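
// Example usage (a minimal sketch; the shapes, data values, and quantization
// parameters below are hypothetical and chosen only to satisfy the DCHECKs
// above, not taken from any real model; assumes <vector> is included):
//
//   tflite::DepthwiseParams params{};
//   params.stride_width = 1;
//   params.stride_height = 1;
//   params.dilation_width_factor = 1;
//   params.dilation_height_factor = 1;
//   params.depth_multiplier = 1;
//   params.input_offset = -128;    // Negated input zero point.
//   params.weights_offset = -128;  // Negated filter zero point.
//   params.output_offset = 128;    // Output zero point.
//   params.output_multiplier = 1 << 30;
//   params.output_shift = -1;
//   params.quantized_activation_min = 0;
//   params.quantized_activation_max = 255;
//
//   // NHWC: 3x3 input with 2 channels, 2x2 filter, stride 1, no padding.
//   tflite::RuntimeShape input_shape({1, 3, 3, 2});
//   tflite::RuntimeShape filter_shape({1, 2, 2, 2});
//   tflite::RuntimeShape bias_shape({2});
//   tflite::RuntimeShape output_shape({1, 2, 2, 2});
//   std::vector<uint8_t> input(input_shape.FlatSize(), 128);
//   std::vector<uint8_t> filter(filter_shape.FlatSize(), 128);
//   std::vector<int32_t> bias(2, 0);
//   std::vector<uint8_t> output(output_shape.FlatSize());
//   tflite::reference_ops::DepthwiseConv(
//       params, input_shape, input.data(), filter_shape, filter.data(),
//       bias_shape, bias.data(), output_shape, output.data());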

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_