/*
 * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_depthwise_conv_s8.c
 * Description:  s8 version of depthwise convolution.
 *
 * $Date:        11. May 2021
 * $Revision:    V.2.5.0
 *
 * Target Processor:  Cortex-M CPUs
 *
 * -------------------------------------------------------------------- */

#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

/**
 *  @ingroup groupNN
 */

/**
 * @addtogroup NNConv
 * @{
 */

static void depthwise_conv_s8_mult_4(const int8_t *input,
                                     const int32_t input_x,
                                     const int32_t input_y,
                                     const int32_t input_ch,
                                     const int8_t *kernel,
                                     const int32_t output_ch,
                                     const int32_t ch_mult,
                                     const int32_t kernel_x,
                                     const int32_t kernel_y,
                                     const int32_t pad_x,
                                     const int32_t pad_y,
                                     const int32_t stride_x,
                                     const int32_t stride_y,
                                     const int32_t *bias,
                                     int8_t *output,
                                     const int32_t *output_shift,
                                     const int32_t *output_mult,
                                     const int32_t output_x,
                                     const int32_t output_y,
                                     const int32_t output_offset,
                                     const int32_t input_offset,
                                     const int32_t output_activation_min,
                                     const int32_t output_activation_max)
{
    for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
    {
        for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w)
        {
            for (int32_t in_ch = 0, out_ch = 0, ker_w_start = MAX(0, -in_w); out_ch < output_ch;
                 ++in_ch, out_ch += ch_mult)
            {
                for (int mult_tile = 0; mult_tile < ch_mult; mult_tile += 4)
                {
                    int32_t out_buff[4];

                    out_buff[0] = bias[out_ch + 0 + mult_tile];
                    out_buff[1] = bias[out_ch + 1 + mult_tile];
                    out_buff[2] = bias[out_ch + 2 + mult_tile];
                    out_buff[3] = bias[out_ch + 3 + mult_tile];

                    for (int32_t ker_h = ker_h_start; ker_h < MIN(kernel_y, input_y - in_h); ++ker_h)
                    {
                        int32_t ker_idx = ker_h * (output_ch * kernel_x) + ker_w_start * output_ch + out_ch;
                        int32_t in_idx = (in_h + ker_h) * (input_ch * input_x) + in_w * input_ch + in_ch;

                        for (int32_t ker_w = ker_w_start; ker_w < MIN(kernel_x, input_x - in_w);
                             ++ker_w, ker_idx += output_ch)
                        {
                            int32_t in_val = input[in_idx + ker_w * input_ch] + input_offset;
                            out_buff[0] += in_val * kernel[ker_idx + 0 + mult_tile];
                            out_buff[1] += in_val * kernel[ker_idx + 1 + mult_tile];
                            out_buff[2] += in_val * kernel[ker_idx + 2 + mult_tile];
                            out_buff[3] += in_val * kernel[ker_idx + 3 + mult_tile];
                        }
                    }
#if defined(ARM_MATH_MVEI)
                    (void)out_idx;
                    int32x4_t res = vldrwq_s32(out_buff);
                    res = arm_requantize_mve_32x4(res,
                                                  vldrwq_s32(&output_mult[out_ch + mult_tile]),
                                                  vldrwq_s32(&output_shift[out_ch + mult_tile]));
                    res = vaddq_n_s32(res, output_offset);

                    res = vmaxq_s32(res, vdupq_n_s32(output_activation_min));
                    res = vminq_s32(res, vdupq_n_s32(output_activation_max));
                    vstrbq_s32(output, res);
                    output += 4;
#else
                    out_buff[0] = arm_nn_requantize(
                        out_buff[0], output_mult[out_ch + 0 + mult_tile], output_shift[out_ch + 0 + mult_tile]);
                    out_buff[1] = arm_nn_requantize(
                        out_buff[1], output_mult[out_ch + 1 + mult_tile], output_shift[out_ch + 1 + mult_tile]);
                    out_buff[2] = arm_nn_requantize(
                        out_buff[2], output_mult[out_ch + 2 + mult_tile], output_shift[out_ch + 2 + mult_tile]);
                    out_buff[3] = arm_nn_requantize(
                        out_buff[3], output_mult[out_ch + 3 + mult_tile], output_shift[out_ch + 3 + mult_tile]);

                    out_buff[0] += output_offset;
                    out_buff[1] += output_offset;
                    out_buff[2] += output_offset;
                    out_buff[3] += output_offset;

                    out_buff[0] = MIN(MAX(out_buff[0], output_activation_min), output_activation_max);
                    out_buff[1] = MIN(MAX(out_buff[1], output_activation_min), output_activation_max);
                    out_buff[2] = MIN(MAX(out_buff[2], output_activation_min), output_activation_max);
                    out_buff[3] = MIN(MAX(out_buff[3], output_activation_min), output_activation_max);

                    output[out_idx++] = (int8_t)out_buff[0];
                    output[out_idx++] = (int8_t)out_buff[1];
                    output[out_idx++] = (int8_t)out_buff[2];
                    output[out_idx++] = (int8_t)out_buff[3];

#endif
                }
            }
        }
    }
}

static void depthwise_conv_s8_generic(const q7_t *input,
                                      const uint16_t input_batches,
                                      const uint16_t input_x,
                                      const uint16_t input_y,
                                      const uint16_t input_ch,
                                      const q7_t *kernel,
                                      const uint16_t output_ch,
                                      const uint16_t ch_mult,
                                      const uint16_t kernel_x,
                                      const uint16_t kernel_y,
                                      const uint16_t pad_x,
                                      const uint16_t pad_y,
                                      const uint16_t stride_x,
                                      const uint16_t stride_y,
                                      const int32_t *bias,
                                      q7_t *output,
                                      const int32_t *output_shift,
                                      const int32_t *output_mult,
                                      const uint16_t output_x,
                                      const uint16_t output_y,
                                      const int32_t output_offset,
                                      const int32_t input_offset,
                                      const int32_t output_activation_min,
                                      const int32_t output_activation_max)
{
    (void)output_ch;
    int i_out = 0;
    int i_batch;

    for (i_batch = 0; i_batch < input_batches; i_batch++)
    {
        for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
        {
            const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
            for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
            {
                const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
                for (int i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
                {
                    for (int i_ch_mult = 0; i_ch_mult < ch_mult; i_ch_mult++)
                    {
                        const int idx_out_ch = i_ch_mult + i_input_ch * ch_mult;
                        int32_t acc_0;
                        /* Condition for kernel start dimension: (base_idx_<x,y> + ker_<x,y>_start) >= 0 */
                        const int ker_y_start = MAX(0, -base_idx_y);
                        const int ker_x_start = MAX(0, -base_idx_x);
                        /* Condition for kernel end dimension: (base_idx_<x,y> + ker_<x,y>_end) < input_<x,y> */
                        const int ker_y_end = MIN(kernel_y, input_y - base_idx_y);
                        const int ker_x_end = MIN(kernel_x, input_x - base_idx_x);
                        acc_0 = bias[idx_out_ch];

                        for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
                        {
                            const int32_t idx_y = base_idx_y + i_ker_y;
                            for (int i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
                            {
                                const int32_t idx_x = base_idx_x + i_ker_x;
                                int32_t idx_0 = (idx_y * input_x + idx_x) * input_ch + i_input_ch;
                                int32_t ker_idx_0 = (i_ker_y * kernel_x + i_ker_x) * (input_ch * ch_mult) + idx_out_ch;

                                acc_0 += (input[idx_0] + input_offset) * kernel[ker_idx_0];
                            }
                        }

                        /* Requantize and clamp output to provided range */
                        acc_0 = arm_nn_requantize(acc_0, output_mult[idx_out_ch], output_shift[idx_out_ch]);
                        acc_0 += output_offset;
                        acc_0 = MAX(acc_0, output_activation_min);
                        acc_0 = MIN(acc_0, output_activation_max);

                        output[i_out++] = acc_0;
                    }
                }
            }
        }
        /* Advance to the next batch */
        input += (input_x * input_y * input_ch);
    }
}

/*
 *  Basic s8 depthwise convolution function.
 *
 *  Refer to the header file for details.
 *  Optimization using the DSP extension is not available for the generic case where the channel multiplier is > 1.
 *  An illustrative usage sketch is provided after the function definition below.
 *
 */
arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
                                 const cmsis_nn_dw_conv_params *dw_conv_params,
                                 const cmsis_nn_per_channel_quant_params *quant_params,
                                 const cmsis_nn_dims *input_dims,
                                 const q7_t *input,
                                 const cmsis_nn_dims *filter_dims,
                                 const q7_t *kernel,
                                 const cmsis_nn_dims *bias_dims,
                                 const int32_t *bias,
                                 const cmsis_nn_dims *output_dims,
                                 q7_t *output)
{
    (void)dw_conv_params->dilation;
    (void)bias_dims;
    (void)ctx;

    if (dw_conv_params->ch_mult % 4 == 0 && input_dims->n == 1)
    {
        depthwise_conv_s8_mult_4(input,
                                 input_dims->w,
                                 input_dims->h,
                                 input_dims->c,
                                 kernel,
                                 output_dims->c,
                                 dw_conv_params->ch_mult,
                                 filter_dims->w,
                                 filter_dims->h,
                                 dw_conv_params->padding.w,
                                 dw_conv_params->padding.h,
                                 dw_conv_params->stride.w,
                                 dw_conv_params->stride.h,
                                 bias,
                                 output,
                                 quant_params->shift,
                                 quant_params->multiplier,
                                 output_dims->w,
                                 output_dims->h,
                                 dw_conv_params->output_offset,
                                 dw_conv_params->input_offset,
                                 dw_conv_params->activation.min,
                                 dw_conv_params->activation.max);
    }
    else
    {
        depthwise_conv_s8_generic(input,
                                  input_dims->n,
                                  input_dims->w,
                                  input_dims->h,
                                  input_dims->c,
                                  kernel,
                                  output_dims->c,
                                  dw_conv_params->ch_mult,
                                  filter_dims->w,
                                  filter_dims->h,
                                  dw_conv_params->padding.w,
                                  dw_conv_params->padding.h,
                                  dw_conv_params->stride.w,
                                  dw_conv_params->stride.h,
                                  bias,
                                  output,
                                  quant_params->shift,
                                  quant_params->multiplier,
                                  output_dims->w,
                                  output_dims->h,
                                  dw_conv_params->output_offset,
                                  dw_conv_params->input_offset,
                                  dw_conv_params->activation.min,
                                  dw_conv_params->activation.max);
    }

    /* Return to application */
    return ARM_MATH_SUCCESS;
}
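
/*
 * Illustrative usage sketch (not part of the library). It shows how the
 * cmsis_nn_* descriptors consumed by arm_depthwise_conv_s8() above can be
 * populated. All tensor shapes, quantization values, buffer contents, the
 * guard macro ARM_NN_DW_CONV_S8_USAGE_EXAMPLE and the function name
 * dw_conv_s8_usage_example are assumptions made for this sketch only; the
 * code is compiled out by default.
 */
#ifdef ARM_NN_DW_CONV_S8_USAGE_EXAMPLE
static arm_status dw_conv_s8_usage_example(void)
{
    /* Assumed shapes: 1x4x4x8 s8 input, 3x3 depthwise kernel, channel multiplier 1,
     * stride 1 and no padding, giving a 1x2x2x8 output. */
    static const q7_t input_data[1 * 4 * 4 * 8] = {0};
    static const q7_t kernel_data[3 * 3 * 8] = {0};
    static const int32_t bias_data[8] = {0};
    static int32_t mult_data[8];
    static int32_t shift_data[8];
    static q7_t output_data[1 * 2 * 2 * 8];

    const cmsis_nn_dims input_dims = {.n = 1, .h = 4, .w = 4, .c = 8};
    const cmsis_nn_dims filter_dims = {.n = 1, .h = 3, .w = 3, .c = 8};
    const cmsis_nn_dims bias_dims = {.n = 1, .h = 1, .w = 1, .c = 8};
    const cmsis_nn_dims output_dims = {.n = 1, .h = 2, .w = 2, .c = 8};

    cmsis_nn_dw_conv_params dw_conv_params;
    dw_conv_params.ch_mult = 1;
    dw_conv_params.stride.w = 1;
    dw_conv_params.stride.h = 1;
    dw_conv_params.padding.w = 0;
    dw_conv_params.padding.h = 0;
    dw_conv_params.dilation.w = 1;
    dw_conv_params.dilation.h = 1;
    dw_conv_params.input_offset = 128;   /* Negated zero point of the input tensor (example value) */
    dw_conv_params.output_offset = -128; /* Zero point of the output tensor (example value) */
    dw_conv_params.activation.min = -128;
    dw_conv_params.activation.max = 127;

    /* Per-channel requantization parameters (example Q31 multiplier and shift values). */
    for (int i = 0; i < 8; i++)
    {
        mult_data[i] = 1073741824;
        shift_data[i] = -1;
    }
    const cmsis_nn_per_channel_quant_params quant_params = {.multiplier = mult_data, .shift = shift_data};

    /* arm_depthwise_conv_s8 does not use a scratch buffer, so a zeroed context is sufficient. */
    cmsis_nn_context ctx = {0};

    return arm_depthwise_conv_s8(&ctx,
                                 &dw_conv_params,
                                 &quant_params,
                                 &input_dims,
                                 input_data,
                                 &filter_dims,
                                 kernel_data,
                                 &bias_dims,
                                 bias_data,
                                 &output_dims,
                                 output_data);
}
#endif /* ARM_NN_DW_CONV_S8_USAGE_EXAMPLE */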

/**
 * @} end of NNConv group
 */