1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
17 
18 #include "tensorflow/lite/kernels/internal/common.h"
19 
20 namespace tflite {
21 
22 namespace reference_ops {
23 
24 // Element-wise mul that can often be used for inner loop of broadcast Mul as
25 // well as the non-broadcast Mul.
MulElementwise(int size,const ArithmeticParams & params,const uint8_t * input1_data,const uint8_t * input2_data,uint8_t * output_data)26 inline void MulElementwise(int size, const ArithmeticParams& params,
27                            const uint8_t* input1_data,
28                            const uint8_t* input2_data, uint8_t* output_data) {
29   for (int i = 0; i < size; ++i) {
30     const int32_t input1_val = params.input1_offset + input1_data[i];
31     const int32_t input2_val = params.input2_offset + input2_data[i];
32     const int32_t unclamped_result =
33         params.output_offset +
34         MultiplyByQuantizedMultiplier(input1_val * input2_val,
35                                       params.output_multiplier,
36                                       params.output_shift);
37     const int32_t clamped_output =
38         std::min(params.quantized_activation_max,
39                  std::max(params.quantized_activation_min, unclamped_result));
40     output_data[i] = static_cast<uint8_t>(clamped_output);
41   }
42 }
43 
44 template <typename T>
Mul(const ArithmeticParams & params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,T * output_data)45 inline void Mul(const ArithmeticParams& params,
46                 const RuntimeShape& input1_shape, const T* input1_data,
47                 const RuntimeShape& input2_shape, const T* input2_data,
48                 const RuntimeShape& output_shape, T* output_data) {
49   T output_activation_min;
50   T output_activation_max;
51   GetActivationParams(params, &output_activation_min, &output_activation_max);
52 
53   const int flat_size =
54       MatchingExtendedShapeFlatSize(input1_shape, input2_shape, output_shape);
55   for (int i = 0; i < flat_size; ++i) {
56     output_data[i] = ActivationFunctionWithMinMax(
57         input1_data[i] * input2_data[i], output_activation_min,
58         output_activation_max);
59   }
60 }
61 
Mul(const ArithmeticParams & params,const RuntimeShape & input1_shape,const uint8_t * input1_data,const RuntimeShape & input2_shape,const uint8_t * input2_data,const RuntimeShape & output_shape,uint8_t * output_data)62 inline void Mul(const ArithmeticParams& params,
63                 const RuntimeShape& input1_shape, const uint8_t* input1_data,
64                 const RuntimeShape& input2_shape, const uint8_t* input2_data,
65                 const RuntimeShape& output_shape, uint8_t* output_data) {
66   TFLITE_DCHECK_LE(params.quantized_activation_min,
67                    params.quantized_activation_max);
68   const int flat_size =
69       MatchingExtendedShapeFlatSize(input1_shape, input2_shape, output_shape);
70 
71   MulElementwise(flat_size, params, input1_data, input2_data, output_data);
72 }
73 
BroadcastMul4DSlow(const ArithmeticParams & params,const RuntimeShape & input1_shape,const uint8_t * input1_data,const RuntimeShape & input2_shape,const uint8_t * input2_data,const RuntimeShape & output_shape,uint8_t * output_data)74 inline void BroadcastMul4DSlow(const ArithmeticParams& params,
75                                const RuntimeShape& input1_shape,
76                                const uint8_t* input1_data,
77                                const RuntimeShape& input2_shape,
78                                const uint8_t* input2_data,
79                                const RuntimeShape& output_shape,
80                                uint8_t* output_data) {
81   NdArrayDesc<4> desc1;
82   NdArrayDesc<4> desc2;
83   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
84                                       &desc2);
85   const RuntimeShape extended_output_shape =
86       RuntimeShape::ExtendedShape(4, output_shape);
87 
88   for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
89     for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
90       for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
91         for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
92           const int32_t input1_val =
93               params.input1_offset +
94               input1_data[SubscriptToIndex(desc1, b, y, x, c)];
95           const int32_t input2_val =
96               params.input2_offset +
97               input2_data[SubscriptToIndex(desc2, b, y, x, c)];
98           const int32_t unclamped_result =
99               params.output_offset +
100               MultiplyByQuantizedMultiplier(input1_val * input2_val,
101                                             params.output_multiplier,
102                                             params.output_shift);
103           const int32_t clamped_output = std::min(
104               params.quantized_activation_max,
105               std::max(params.quantized_activation_min, unclamped_result));
106           output_data[Offset(extended_output_shape, b, y, x, c)] =
107               static_cast<uint8_t>(clamped_output);
108         }
109       }
110     }
111   }
112 }
113 
114 template <typename T>
BroadcastMul4DSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,T * output_data)115 void BroadcastMul4DSlow(const ArithmeticParams& params,
116                         const RuntimeShape& unextended_input1_shape,
117                         const T* input1_data,
118                         const RuntimeShape& unextended_input2_shape,
119                         const T* input2_data,
120                         const RuntimeShape& unextended_output_shape,
121                         T* output_data) {
122   T output_activation_min;
123   T output_activation_max;
124   GetActivationParams(params, &output_activation_min, &output_activation_max);
125 
126   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
127   TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
128   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
129   const RuntimeShape output_shape =
130       RuntimeShape::ExtendedShape(4, unextended_output_shape);
131 
132   NdArrayDesc<4> desc1;
133   NdArrayDesc<4> desc2;
134   NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
135                                       unextended_input2_shape, &desc1, &desc2);
136 
137   // In Tensorflow, the dimensions are canonically named (batch_number, row,
138   // col, channel), with extents (batches, height, width, depth), with the
139   // trailing dimension changing most rapidly (channels has the smallest stride,
140   // typically 1 element).
141   //
142   // In generated C code, we store arrays with the dimensions reversed. The
143   // first dimension has smallest stride.
144   //
145   // We name our variables by their Tensorflow convention, but generate C code
146   // nesting loops such that the innermost loop has the smallest stride for the
147   // best cache behavior.
148   for (int b = 0; b < output_shape.Dims(0); ++b) {
149     for (int y = 0; y < output_shape.Dims(1); ++y) {
150       for (int x = 0; x < output_shape.Dims(2); ++x) {
151         for (int c = 0; c < output_shape.Dims(3); ++c) {
152           output_data[Offset(output_shape, b, y, x, c)] =
153               ActivationFunctionWithMinMax(
154                   input1_data[SubscriptToIndex(desc1, b, y, x, c)] *
155                       input2_data[SubscriptToIndex(desc2, b, y, x, c)],
156                   output_activation_min, output_activation_max);
157         }
158       }
159     }
160   }
161 }
162 
163 }  // namespace reference_ops
164 }  // namespace tflite
165 
166 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
167