1 /*
2  * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  *
6  * Licensed under the Apache License, Version 2.0 (the License); you may
7  * not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 /* ----------------------------------------------------------------------
20  * Project:      CMSIS NN Library
21  * Title:        arm_convolve_1_x_n_s8.c
22  * Description:  s8 version of 1xN convolution using symmetric quantization.
23  *
24  * $Date:        January 26, 2021
25  * $Revision:    V.2.0.3
26  *
27  * Target Processor:  Cortex-M cores
28  *
29  * -------------------------------------------------------------------- */
30 
31 #include "arm_nnfunctions.h"
32 #include "arm_nnsupportfunctions.h"
33 
34 /**
35  *  @ingroup groupNN
36  */
37 
38 /**
39  * @addtogroup NNConv
40  * @{
41  */
42 
43 /*
44  * 1xN s8 convolution function.
45  *
46  * Refer header file for details.
47  *
48  */
49 
arm_convolve_1_x_n_s8(const cmsis_nn_context * ctx,const cmsis_nn_conv_params * conv_params,const cmsis_nn_per_channel_quant_params * quant_params,const cmsis_nn_dims * input_dims,const q7_t * input_data,const cmsis_nn_dims * filter_dims,const q7_t * filter_data,const cmsis_nn_dims * bias_dims,const int32_t * bias_data,const cmsis_nn_dims * output_dims,q7_t * output_data)50 arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
51                                  const cmsis_nn_conv_params *conv_params,
52                                  const cmsis_nn_per_channel_quant_params *quant_params,
53                                  const cmsis_nn_dims *input_dims,
54                                  const q7_t *input_data,
55                                  const cmsis_nn_dims *filter_dims,
56                                  const q7_t *filter_data,
57                                  const cmsis_nn_dims *bias_dims,
58                                  const int32_t *bias_data,
59                                  const cmsis_nn_dims *output_dims,
60                                  q7_t *output_data)
61 {
62     (void)bias_dims;
63     arm_status status = ARM_MATH_SUCCESS;
64     if (output_dims->w % 4 != 0)
65     {
66         status = ARM_MATH_SIZE_MISMATCH;
67         goto out;
68     }
69 
70 #if defined(ARM_MATH_MVEI)
71     (void)ctx;
72 
73     const uint16_t input_x = input_dims->w;
74     const uint16_t kernel_x = filter_dims->w;
75     const uint16_t output_x = output_dims->w;
76     const uint16_t output_ch = output_dims->c;
77     const uint16_t input_ch = input_dims->c;
78     const uint16_t pad_x = conv_params->padding.w;
79     const uint16_t stride_x = conv_params->stride.w;
80 
81     const int32_t input_offset = conv_params->input_offset;
82     const int32_t out_offset = conv_params->output_offset;
83     const int32_t out_activation_min = conv_params->activation.min;
84     const int32_t out_activation_max = conv_params->activation.max;
85     int32_t *output_mult = quant_params->multiplier;
86     int32_t *output_shift = quant_params->shift;
87 
88     for (int i_out_x = 0; i_out_x <= (output_x - 4); i_out_x += 4)
89     {
90         int32_t input_begin_idx[4];
91         int32_t ker_begin_idx[4];
92         int32_t ker_end_idx[4];
93 
94         for (int i = 0; i < 4; i++)
95         {
96             const int32_t est_input_x_idx = stride_x * (i_out_x + i) - pad_x;
97             input_begin_idx[i] = MAX(0, est_input_x_idx);
98             ker_begin_idx[i] = MAX(0, -est_input_x_idx);
99             ker_end_idx[i] = MIN(kernel_x, input_x - est_input_x_idx);
100         }
101 
102         for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
103         {
104             int32x4_t s_offset;
105             int32_t acc[4];
106             if ((ker_begin_idx[0] != 0) || (ker_end_idx[3] != kernel_x))
107             {
108                 int32_t sum_row[4];
109 
110                 (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[0] - ker_begin_idx[0]) * input_ch,
111                                                 input_data + input_begin_idx[0] * input_ch,
112                                                 filter_data + (input_ch * kernel_x * i_out_ch) +
113                                                     (ker_begin_idx[0] * input_ch),
114                                                 &sum_row[0],
115                                                 &acc[0]);
116                 (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[1] - ker_begin_idx[1]) * input_ch,
117                                                 input_data + input_begin_idx[1] * input_ch,
118                                                 filter_data + (input_ch * kernel_x * i_out_ch) +
119                                                     (ker_begin_idx[1] * input_ch),
120                                                 &sum_row[1],
121                                                 &acc[1]);
122 
123                 (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[2] - ker_begin_idx[2]) * input_ch,
124                                                 input_data + input_begin_idx[2] * input_ch,
125                                                 filter_data + (input_ch * kernel_x * i_out_ch) +
126                                                     (ker_begin_idx[2] * input_ch),
127                                                 &sum_row[2],
128                                                 &acc[2]);
129 
130                 (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[3] - ker_begin_idx[3]) * input_ch,
131                                                 input_data + input_begin_idx[3] * input_ch,
132                                                 filter_data + (input_ch * kernel_x * i_out_ch) +
133                                                     (ker_begin_idx[3] * input_ch),
134                                                 &sum_row[3],
135                                                 &acc[3]);
136 
137                 s_offset = vldrwq_s32(sum_row);
138             }
139             else
140             {
141                 int32_t sum_row;
142                 (void)arm_nn_mat_mul_core_4x_s8(kernel_x * input_ch,
143                                                 stride_x * input_ch,
144                                                 input_data + input_begin_idx[0] * input_ch,
145                                                 filter_data + (input_ch * kernel_x * i_out_ch),
146                                                 &sum_row,
147                                                 acc);
148 
149                 s_offset = vdupq_n_s32(sum_row);
150             }
151             int32x4_t res = vldrwq_s32(acc);
152             s_offset = vmulq_n_s32(s_offset, input_offset);
153             res = vaddq_s32(res, s_offset);
154             if (bias_data)
155             {
156                 res = vaddq_n_s32(res, bias_data[i_out_ch]);
157             }
158             res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]);
159             res = vaddq_n_s32(res, out_offset);
160 
161             res = vmaxq_s32(res, vdupq_n_s32(out_activation_min));
162             res = vminq_s32(res, vdupq_n_s32(out_activation_max));
163 
164             const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3};
165             vstrbq_scatter_offset_s32(output_data, scatter_offset, res);
166             output_data++;
167         }
168         output_data += (3 * output_ch);
169     }
170 
171 #else
172     status = arm_convolve_s8(ctx,
173                              conv_params,
174                              quant_params,
175                              input_dims,
176                              input_data,
177                              filter_dims,
178                              filter_data,
179                              bias_dims,
180                              bias_data,
181                              output_dims,
182                              output_data);
183 #endif
184 
185 out:
186     /* Return to application */
187     return status;
188 }
189 
/*
 * Buffer size query for arm_convolve_1_x_n_s8().
 *
 * Returns 0 unless ARM_MATH_DSP is defined without ARM_MATH_MVEI. In that
 * configuration the result is the size in bytes of
 * 2 * input_dims->c * filter_dims->w * filter_dims->h int16_t elements
 * (presumably the scratch buffer used by arm_convolve_s8() - confirm in the header).
 */
int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
#if !defined(ARM_MATH_DSP) || defined(ARM_MATH_MVEI)
    /* No buffer is requested in this configuration; the arguments are unused. */
    (void)input_dims;
    (void)filter_dims;
    return 0;
#else
    return (2 * input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t);
#endif
}
200 
201 /**
202  * @} end of NNConv group
203  */
204