/*
 * SPDX-FileCopyrightText: Copyright 2010-2020, 2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_depthwise_conv_nt_t_padded_s8.c
 * Description:  Depthwise convolution with padded matrices.
 *
 * $Date:        26 October 2022
 * $Revision:    V.2.0.1
 *
 * Target Processor:  Cortex-M processors with MVE extension
 * -------------------------------------------------------------------- */

#include "arm_nnsupportfunctions.h"

/**
 * @ingroup groupSupport
 */

/**
 * @defgroup supportConvolution Convolution
 *
 * Support functions for Convolution and DW Convolution
 *
 */

/**
 * @addtogroup supportConvolution
 * @{
 */
/*
 * Depthwise convolution of transposed rhs matrix with 4 lhs matrices. One or more of the rhs matrices are padded.
 * Dimensions are the same for lhs and rhs.
 *
 * Refer to the header file for details. An illustrative scalar sketch of the
 * computation can be found after the function below.
 *
 */

arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8(const int8_t *lhs,
                                                         const int8_t *rhs,
                                                         const int32_t input_offset,
                                                         const int32_t active_ch,
                                                         const int32_t total_ch,
                                                         const int32_t *out_shift,
                                                         const int32_t *out_mult,
                                                         const int32_t out_offset,
                                                         const int32_t activation_min,
                                                         const int32_t activation_max,
                                                         const uint16_t row_x_col,
                                                         const int32_t *const output_bias,
                                                         int8_t *out)
{
#if defined(ARM_MATH_MVEI)
    int32_t loop_count = (active_ch + 3) / 4;
    const int32_t *bias = output_bias;
    uint32_t num_ch_to_process = active_ch;

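    /* Process the channels in blocks of four. Each iteration computes the
     * same four channels for four adjacent outputs (out_0..out_3); a partial
     * last block is handled further down with tail predication. */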
    for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count;
         num_ch_to_process -= 4, out += 4, offset += 4, i_loop_cnt++)
    {
        int32x4_t out_0 = vdupq_n_s32(0);
        if (bias)
        {
            out_0 = vldrwq_s32(bias);
            bias += 4;
        }
        int32x4_t out_1 = out_0;
        int32x4_t out_2 = out_0;
        int32x4_t out_3 = out_0;

        const int8_t *rhs_0 = rhs + offset;
        const int8_t *lhs_0 = lhs + offset;
        const int8_t *lhs_1 = lhs + row_x_col * CH_IN_BLOCK_MVE + offset;
        const int8_t *lhs_2 = lhs + (row_x_col * CH_IN_BLOCK_MVE * 2) + offset;
        const int8_t *lhs_3 = lhs + (row_x_col * CH_IN_BLOCK_MVE * 3) + offset;

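        /* Accumulate over all row_x_col kernel positions. The kernel vector
         * ker_0 is loaded once per position and reused for all four
         * accumulators; the input offset is applied before each multiply. */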
        for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++)
        {
            const int32x4_t ker_0 = vldrbq_s32(rhs_0);

            int32x4_t ip_0 = vldrbq_s32(lhs_0);
            ip_0 = vaddq_n_s32(ip_0, input_offset);
            out_0 += vmulq_s32(ip_0, ker_0);

            int32x4_t ip_1 = vldrbq_s32(lhs_1);
            ip_1 = vaddq_n_s32(ip_1, input_offset);
            out_1 += vmulq_s32(ip_1, ker_0);

            int32x4_t ip_2 = vldrbq_s32(lhs_2);
            ip_2 = vaddq_n_s32(ip_2, input_offset);
            out_2 += vmulq_s32(ip_2, ker_0);

            int32x4_t ip_3 = vldrbq_s32(lhs_3);
            ip_3 = vaddq_n_s32(ip_3, input_offset);
            out_3 += vmulq_s32(ip_3, ker_0);

            lhs_0 += CH_IN_BLOCK_MVE;
            lhs_1 += CH_IN_BLOCK_MVE;
            lhs_2 += CH_IN_BLOCK_MVE;
            lhs_3 += CH_IN_BLOCK_MVE;

            rhs_0 += total_ch;
        }

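        /* Requantize each accumulator with the per-channel multiplier and
         * shift, add the output offset and clamp to the activation range.
         * The predicate p masks the stores so that a final block with fewer
         * than four active channels writes only the valid lanes. */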
        const int32x4_t mult = vldrwq_s32(out_mult);
        const int32x4_t shift = vldrwq_s32(out_shift);
        out_mult += 4;
        out_shift += 4;

        out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
        out_0 = vaddq_n_s32(out_0, out_offset);
        out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
        out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
        mve_pred16_t p = vctp32q(num_ch_to_process);
        vstrbq_p_s32(out, out_0, p);

        out_1 = arm_requantize_mve_32x4(out_1, mult, shift);
        out_1 = vaddq_n_s32(out_1, out_offset);
        out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
        out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
        vstrbq_p_s32(out + total_ch, out_1, p);

        out_2 = arm_requantize_mve_32x4(out_2, mult, shift);
        out_2 = vaddq_n_s32(out_2, out_offset);
        out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min));
        out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max));
        vstrbq_p_s32(out + 2 * total_ch, out_2, p);

        out_3 = arm_requantize_mve_32x4(out_3, mult, shift);
        out_3 = vaddq_n_s32(out_3, out_offset);
        out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min));
        out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max));
        vstrbq_p_s32(out + 3 * total_ch, out_3, p);
    }

    return ARM_CMSIS_NN_SUCCESS;

#else
    (void)lhs;
    (void)rhs;
    (void)input_offset;
    (void)active_ch;
    (void)total_ch;
    (void)out_shift;
    (void)out_mult;
    (void)out_offset;
    (void)activation_min;
    (void)activation_max;
    (void)row_x_col;
    (void)output_bias;
    (void)out;
    return ARM_CMSIS_NN_NO_IMPL_ERROR;
#endif
}
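
/*
 * The function below is an illustrative scalar sketch of what the vector
 * code above computes; it is not part of the library and is written for
 * readability, not performance. The ref_* names are placeholders, and
 * ref_requantize() is a simplified stand-in for arm_requantize_mve_32x4()
 * (the library version also saturates intermediate results).
 */
static int32_t ref_requantize(int32_t val, int32_t mult, int32_t shift)
{
    /* Doubling high multiply with a Q31 multiplier, then a rounding right
     * shift. A positive shift scales up before the multiply, a negative one
     * scales down after it. */
    const int32_t left = shift > 0 ? shift : 0;
    const int32_t right = shift > 0 ? 0 : -shift;
    int64_t acc = (int64_t)(val * (1 << left)) * mult;
    acc = (acc + (1LL << 30)) >> 31;
    if (right > 0)
    {
        acc = (acc + (1LL << (right - 1))) >> right;
    }
    return (int32_t)acc;
}

static void ref_depthwise_conv_nt_t_padded_s8(const int8_t *lhs,
                                              const int8_t *rhs,
                                              const int32_t input_offset,
                                              const int32_t active_ch,
                                              const int32_t total_ch,
                                              const int32_t *out_shift,
                                              const int32_t *out_mult,
                                              const int32_t out_offset,
                                              const int32_t activation_min,
                                              const int32_t activation_max,
                                              const uint16_t row_x_col,
                                              const int32_t *const output_bias,
                                              int8_t *out)
{
    for (int32_t ch = 0; ch < active_ch; ch++)
    {
        /* Four adjacent outputs share the same kernel column. */
        for (int32_t pixel = 0; pixel < 4; pixel++)
        {
            int32_t acc = output_bias ? output_bias[ch] : 0;
            for (int32_t k = 0; k < row_x_col; k++)
            {
                /* lhs slices are row_x_col * CH_IN_BLOCK_MVE bytes apart;
                 * rhs is stored with a channel stride of total_ch. */
                const int32_t input = lhs[(pixel * row_x_col + k) * CH_IN_BLOCK_MVE + ch] + input_offset;
                acc += input * rhs[k * total_ch + ch];
            }
            acc = ref_requantize(acc, out_mult[ch], out_shift[ch]) + out_offset;
            acc = acc < activation_min ? activation_min : acc;
            acc = acc > activation_max ? activation_max : acc;
            out[pixel * total_ch + ch] = (int8_t)acc;
        }
    }
}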

/**
 * @} end of Doxygen group
 */