/*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_convolve_1_x_n_s8.c
 * Description:  s8 version of 1xN convolution using symmetric quantization.
 *
 * $Date:        January 26, 2021
 * $Revision:    V.2.0.3
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */

#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

/**
 * @ingroup groupNN
 */

/**
 * @addtogroup NNConv
 * @{
 */

/*
 * 1xN s8 convolution function.
 *
 * Refer to the header file for details.
 *
 */
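
/*
 * Illustrative calling sketch. All dimension values below are hypothetical and
 * conv_params, quant_params and the data buffers are assumed to be set up
 * elsewhere; see the header file for the authoritative contract.
 *
 *   cmsis_nn_dims input_dims  = {1, 1, 32, 8};   // N, H, W, C_in (H must be 1)
 *   cmsis_nn_dims filter_dims = {16, 1, 3, 8};   // C_out, 1, kernel_w, C_in
 *   cmsis_nn_dims output_dims = {1, 1, 32, 16};  // output width must be a multiple of 4
 *
 *   cmsis_nn_context ctx;
 *   ctx.size = arm_convolve_1_x_n_s8_get_buffer_size(&input_dims, &filter_dims);
 *   ctx.buf  = ctx.size ? malloc(ctx.size) : NULL;
 *
 *   arm_status st = arm_convolve_1_x_n_s8(&ctx, &conv_params, &quant_params,
 *                                         &input_dims, input, &filter_dims, kernel,
 *                                         &bias_dims, bias, &output_dims, output);
 */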

arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
                                 const cmsis_nn_conv_params *conv_params,
                                 const cmsis_nn_per_channel_quant_params *quant_params,
                                 const cmsis_nn_dims *input_dims,
                                 const q7_t *input_data,
                                 const cmsis_nn_dims *filter_dims,
                                 const q7_t *filter_data,
                                 const cmsis_nn_dims *bias_dims,
                                 const int32_t *bias_data,
                                 const cmsis_nn_dims *output_dims,
                                 q7_t *output_data)
{
    (void)bias_dims;
    arm_status status = ARM_MATH_SUCCESS;
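    /* The MVE kernel below computes four output columns per iteration, so the
       output width must be a multiple of 4. */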
    if (output_dims->w % 4 != 0)
    {
        status = ARM_MATH_SIZE_MISMATCH;
        goto out;
    }

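    /* Helium (MVE) build: vectorized 1xN kernel. Builds without MVE fall back to
       the generic arm_convolve_s8() further below. */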
#if defined(ARM_MATH_MVEI)
    (void)ctx;

    const uint16_t input_x = input_dims->w;
    const uint16_t kernel_x = filter_dims->w;
    const uint16_t output_x = output_dims->w;
    const uint16_t output_ch = output_dims->c;
    const uint16_t input_ch = input_dims->c;
    const uint16_t pad_x = conv_params->padding.w;
    const uint16_t stride_x = conv_params->stride.w;

    const int32_t input_offset = conv_params->input_offset;
    const int32_t out_offset = conv_params->output_offset;
    const int32_t out_activation_min = conv_params->activation.min;
    const int32_t out_activation_max = conv_params->activation.max;
    int32_t *output_mult = quant_params->multiplier;
    int32_t *output_shift = quant_params->shift;

    for (int i_out_x = 0; i_out_x <= (output_x - 4); i_out_x += 4)
    {
        int32_t input_begin_idx[4];
        int32_t ker_begin_idx[4];
        int32_t ker_end_idx[4];

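        /* For each of the four output columns, find where its receptive field starts
           in the input and clip the kernel range against the left/right padding. */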
        for (int i = 0; i < 4; i++)
        {
            const int32_t est_input_x_idx = stride_x * (i_out_x + i) - pad_x;
            input_begin_idx[i] = MAX(0, est_input_x_idx);
            ker_begin_idx[i] = MAX(0, -est_input_x_idx);
            ker_end_idx[i] = MIN(kernel_x, input_x - est_input_x_idx);
        }

        for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
        {
            int32x4_t s_offset;
            int32_t acc[4];
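            /* If any of the four columns overlaps the padded region, compute each column
               with a separate 1-column multiply over its clipped kernel range. Otherwise
               all four columns use the full kernel and the faster 4-column multiply applies. */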
            if ((ker_begin_idx[0] != 0) || (ker_end_idx[3] != kernel_x))
            {
                int32_t sum_row[4];

                (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[0] - ker_begin_idx[0]) * input_ch,
                                                input_data + input_begin_idx[0] * input_ch,
                                                filter_data + (input_ch * kernel_x * i_out_ch) +
                                                    (ker_begin_idx[0] * input_ch),
                                                &sum_row[0],
                                                &acc[0]);
                (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[1] - ker_begin_idx[1]) * input_ch,
                                                input_data + input_begin_idx[1] * input_ch,
                                                filter_data + (input_ch * kernel_x * i_out_ch) +
                                                    (ker_begin_idx[1] * input_ch),
                                                &sum_row[1],
                                                &acc[1]);

                (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[2] - ker_begin_idx[2]) * input_ch,
                                                input_data + input_begin_idx[2] * input_ch,
                                                filter_data + (input_ch * kernel_x * i_out_ch) +
                                                    (ker_begin_idx[2] * input_ch),
                                                &sum_row[2],
                                                &acc[2]);

                (void)arm_nn_mat_mul_core_1x_s8((ker_end_idx[3] - ker_begin_idx[3]) * input_ch,
                                                input_data + input_begin_idx[3] * input_ch,
                                                filter_data + (input_ch * kernel_x * i_out_ch) +
                                                    (ker_begin_idx[3] * input_ch),
                                                &sum_row[3],
                                                &acc[3]);

                s_offset = vldrwq_s32(sum_row);
            }
            else
            {
                int32_t sum_row;
                (void)arm_nn_mat_mul_core_4x_s8(kernel_x * input_ch,
                                                stride_x * input_ch,
                                                input_data + input_begin_idx[0] * input_ch,
                                                filter_data + (input_ch * kernel_x * i_out_ch),
                                                &sum_row,
                                                acc);

                s_offset = vdupq_n_s32(sum_row);
            }
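            /* Add the input-offset correction (input_offset times the sum of the kernel
               weights used), the optional per-channel bias, then requantize per channel
               and clamp to the activation range. */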
            int32x4_t res = vldrwq_s32(acc);
            s_offset = vmulq_n_s32(s_offset, input_offset);
            res = vaddq_s32(res, s_offset);
            if (bias_data)
            {
                res = vaddq_n_s32(res, bias_data[i_out_ch]);
            }
            res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]);
            res = vaddq_n_s32(res, out_offset);

            res = vmaxq_s32(res, vdupq_n_s32(out_activation_min));
            res = vminq_s32(res, vdupq_n_s32(out_activation_max));

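            /* The output layout is NHWC, so the four results for this channel are
               output_ch bytes apart; store them with a byte scatter. */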
            const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3};
            vstrbq_scatter_offset_s32(output_data, scatter_offset, res);
            output_data++;
        }
        output_data += (3 * output_ch);
    }

#else
    status = arm_convolve_s8(ctx,
                             conv_params,
                             quant_params,
                             input_dims,
                             input_data,
                             filter_dims,
                             filter_data,
                             bias_dims,
                             bias_data,
                             output_dims,
                             output_data);
#endif

out:
    /* Return to application */
    return status;
}

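/*
 * Get the required scratch buffer size for arm_convolve_1_x_n_s8.
 *
 * Refer to the header file for details.
 */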
int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
{
#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
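    /* The DSP (non-MVE) fallback in arm_convolve_s8() expands the input into an
       im2col buffer holding two kernel columns as int16, hence the factor of 2. */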
    return (2 * input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t);
#else
    (void)input_dims;
    (void)filter_dims;
    return 0;
#endif
}

/**
 * @} end of NNConv group
 */