1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
17
18 #include "tensorflow/lite/kernels/internal/common.h"
19
20 namespace tflite {
21
22 namespace reference_ops {
23
24 // Element-wise mul that can often be used for inner loop of broadcast Mul as
25 // well as the non-broadcast Mul.
MulElementwise(int size,const ArithmeticParams & params,const uint8_t * input1_data,const uint8_t * input2_data,uint8_t * output_data)26 inline void MulElementwise(int size, const ArithmeticParams& params,
27 const uint8_t* input1_data,
28 const uint8_t* input2_data, uint8_t* output_data) {
29 for (int i = 0; i < size; ++i) {
30 const int32_t input1_val = params.input1_offset + input1_data[i];
31 const int32_t input2_val = params.input2_offset + input2_data[i];
32 const int32_t unclamped_result =
33 params.output_offset +
34 MultiplyByQuantizedMultiplier(input1_val * input2_val,
35 params.output_multiplier,
36 params.output_shift);
37 const int32_t clamped_output =
38 std::min(params.quantized_activation_max,
39 std::max(params.quantized_activation_min, unclamped_result));
40 output_data[i] = static_cast<uint8_t>(clamped_output);
41 }
42 }
43
44 template <typename T>
Mul(const ArithmeticParams & params,const RuntimeShape & input1_shape,const T * input1_data,const RuntimeShape & input2_shape,const T * input2_data,const RuntimeShape & output_shape,T * output_data)45 inline void Mul(const ArithmeticParams& params,
46 const RuntimeShape& input1_shape, const T* input1_data,
47 const RuntimeShape& input2_shape, const T* input2_data,
48 const RuntimeShape& output_shape, T* output_data) {
49 T output_activation_min;
50 T output_activation_max;
51 GetActivationParams(params, &output_activation_min, &output_activation_max);
52
53 const int flat_size =
54 MatchingExtendedShapeFlatSize(input1_shape, input2_shape, output_shape);
55 for (int i = 0; i < flat_size; ++i) {
56 output_data[i] = ActivationFunctionWithMinMax(
57 input1_data[i] * input2_data[i], output_activation_min,
58 output_activation_max);
59 }
60 }
61
Mul(const ArithmeticParams & params,const RuntimeShape & input1_shape,const uint8_t * input1_data,const RuntimeShape & input2_shape,const uint8_t * input2_data,const RuntimeShape & output_shape,uint8_t * output_data)62 inline void Mul(const ArithmeticParams& params,
63 const RuntimeShape& input1_shape, const uint8_t* input1_data,
64 const RuntimeShape& input2_shape, const uint8_t* input2_data,
65 const RuntimeShape& output_shape, uint8_t* output_data) {
66 TFLITE_DCHECK_LE(params.quantized_activation_min,
67 params.quantized_activation_max);
68 const int flat_size =
69 MatchingExtendedShapeFlatSize(input1_shape, input2_shape, output_shape);
70
71 MulElementwise(flat_size, params, input1_data, input2_data, output_data);
72 }
73
BroadcastMul4DSlow(const ArithmeticParams & params,const RuntimeShape & input1_shape,const uint8_t * input1_data,const RuntimeShape & input2_shape,const uint8_t * input2_data,const RuntimeShape & output_shape,uint8_t * output_data)74 inline void BroadcastMul4DSlow(const ArithmeticParams& params,
75 const RuntimeShape& input1_shape,
76 const uint8_t* input1_data,
77 const RuntimeShape& input2_shape,
78 const uint8_t* input2_data,
79 const RuntimeShape& output_shape,
80 uint8_t* output_data) {
81 NdArrayDesc<4> desc1;
82 NdArrayDesc<4> desc2;
83 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
84 &desc2);
85 const RuntimeShape extended_output_shape =
86 RuntimeShape::ExtendedShape(4, output_shape);
87
88 for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
89 for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
90 for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
91 for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
92 const int32_t input1_val =
93 params.input1_offset +
94 input1_data[SubscriptToIndex(desc1, b, y, x, c)];
95 const int32_t input2_val =
96 params.input2_offset +
97 input2_data[SubscriptToIndex(desc2, b, y, x, c)];
98 const int32_t unclamped_result =
99 params.output_offset +
100 MultiplyByQuantizedMultiplier(input1_val * input2_val,
101 params.output_multiplier,
102 params.output_shift);
103 const int32_t clamped_output = std::min(
104 params.quantized_activation_max,
105 std::max(params.quantized_activation_min, unclamped_result));
106 output_data[Offset(extended_output_shape, b, y, x, c)] =
107 static_cast<uint8_t>(clamped_output);
108 }
109 }
110 }
111 }
112 }
113
114 template <typename T>
BroadcastMul4DSlow(const ArithmeticParams & params,const RuntimeShape & unextended_input1_shape,const T * input1_data,const RuntimeShape & unextended_input2_shape,const T * input2_data,const RuntimeShape & unextended_output_shape,T * output_data)115 void BroadcastMul4DSlow(const ArithmeticParams& params,
116 const RuntimeShape& unextended_input1_shape,
117 const T* input1_data,
118 const RuntimeShape& unextended_input2_shape,
119 const T* input2_data,
120 const RuntimeShape& unextended_output_shape,
121 T* output_data) {
122 T output_activation_min;
123 T output_activation_max;
124 GetActivationParams(params, &output_activation_min, &output_activation_max);
125
126 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
127 TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
128 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
129 const RuntimeShape output_shape =
130 RuntimeShape::ExtendedShape(4, unextended_output_shape);
131
132 NdArrayDesc<4> desc1;
133 NdArrayDesc<4> desc2;
134 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape,
135 unextended_input2_shape, &desc1, &desc2);
136
137 // In Tensorflow, the dimensions are canonically named (batch_number, row,
138 // col, channel), with extents (batches, height, width, depth), with the
139 // trailing dimension changing most rapidly (channels has the smallest stride,
140 // typically 1 element).
141 //
142 // In generated C code, we store arrays with the dimensions reversed. The
143 // first dimension has smallest stride.
144 //
145 // We name our variables by their Tensorflow convention, but generate C code
146 // nesting loops such that the innermost loop has the smallest stride for the
147 // best cache behavior.
148 for (int b = 0; b < output_shape.Dims(0); ++b) {
149 for (int y = 0; y < output_shape.Dims(1); ++y) {
150 for (int x = 0; x < output_shape.Dims(2); ++x) {
151 for (int c = 0; c < output_shape.Dims(3); ++c) {
152 output_data[Offset(output_shape, b, y, x, c)] =
153 ActivationFunctionWithMinMax(
154 input1_data[SubscriptToIndex(desc1, b, y, x, c)] *
155 input2_data[SubscriptToIndex(desc2, b, y, x, c)],
156 output_activation_min, output_activation_max);
157 }
158 }
159 }
160 }
161 }
162
163 } // namespace reference_ops
164 } // namespace tflite
165
166 #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_MUL_H_
167