/*
 * SPDX-FileCopyrightText: Copyright 2010-2020, 2022, 2024 Arm Limited and/or its affiliates
 * <open-source-office@arm.com>
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_depthwise_conv_nt_t_s4.c
 * Description:  Depthwise convolution on matrices with no padding and packed int4 weights.
 *
 * $Date:        05 April 2024
 * $Revision:    V.1.0.0
 *
 * Target Processor:  Cortex-M processors with MVE extension.
 * -------------------------------------------------------------------- */

#include "arm_nnsupportfunctions.h"

/**
 * @ingroup groupSupport
 */

/**
 * @addtogroup supportConvolution
 * @{
 */

/*
 * Depthwise convolution of the rhs matrix with 4 lhs matrices, with no padding and packed int4 weights.
 * Dimensions are the same for lhs and rhs.
 *
 * Refer to the header file for details.
 *
 */
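/*
 * Illustrative call (hypothetical values, not taken from the header): a 3x3 kernel
 * (row_x_col = 9) over 8 channels, with four output points gathered into lhs by the
 * caller and per-channel requantization parameters in out_mult / out_shift:
 *
 *   arm_cmsis_nn_status status = arm_nn_depthwise_conv_nt_t_s4(
 *       lhs_buffer, packed_weights, 128, 8, 8, out_shift, out_mult,
 *       -128, -128, 127, 9, bias, out_ptr);
 *
 * lhs_buffer, packed_weights, out_shift, out_mult, bias and out_ptr are assumed to be
 * caller-provided buffers of the appropriate sizes.
 */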
arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s4(const int8_t *lhs,
                                                  const int8_t *rhs,
                                                  const int32_t input_offset,
                                                  const int32_t active_ch,
                                                  const int32_t total_ch,
                                                  const int32_t *out_shift,
                                                  const int32_t *out_mult,
                                                  const int32_t out_offset,
                                                  const int32_t activation_min,
                                                  const int32_t activation_max,
                                                  const uint16_t row_x_col,
                                                  const int32_t *const output_bias,
                                                  int8_t *out)
{
#if defined(ARM_MATH_MVEI)
    const int32_t *bias = output_bias;
    int32_t loop_count = (active_ch + 3) / 4;
    uint32_t num_ch_to_process = active_ch;
    const uint32x4_t gather_offset = {0, 0, 1, 1};   // Duplicate each packed byte into two adjacent 32-bit lanes
    const mve_pred16_t lower_nibble_mask = 3855;     // 0x0F0F: predicate selecting 32-bit lanes 0 and 2

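    /* Process four channels per iteration, for four adjacent output points (the four
       lhs blocks) at the same time. */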
    for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count;
         num_ch_to_process -= 4, offset += 4, out += 4, i_loop_cnt++)
    {
        int32x4_t out_0 = vdupq_n_s32(0);
        if (bias)
        {
            out_0 = vldrwq_s32(bias);
            bias += 4;
        }
        int32x4_t out_1 = out_0;
        int32x4_t out_2 = out_0;
        int32x4_t out_3 = out_0;

        const int8_t *rhs_0 = rhs + (offset >> 1);
        const int8_t *lhs_0 = lhs + offset;
        const int8_t *lhs_1 = lhs + row_x_col * S4_CH_IN_BLOCK_MVE + offset;
        const int8_t *lhs_2 = lhs + (row_x_col * S4_CH_IN_BLOCK_MVE * 2) + offset;
        const int8_t *lhs_3 = lhs + (row_x_col * S4_CH_IN_BLOCK_MVE * 3) + offset;
        int32x4_t ker_sum = vdupq_n_s32(0);

        if (total_ch % 2)
        {
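            /* Odd channel count: each kernel row advances by a non-integer number of
               bytes, so this four-channel group alternates between starting on a byte
               boundary and starting mid-byte. */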
            int get_low_nibble = 1;
            for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++)
            {
                int32x4_t ker_0;
                if (get_low_nibble)
                {
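                    /* Byte-aligned group: the gather loads each packed byte into two
                       adjacent lanes. Lanes 0 and 2 are shifted left by 28 and then
                       arithmetically right by 24 to sign-extend the low nibble; the
                       final shift right by 4 finishes the low nibbles and extracts
                       the sign-extended high nibbles in lanes 1 and 3. */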
                    ker_0 = vldrbq_gather_offset_s32(rhs_0, gather_offset);

                    ker_0 = vrshlq_m_n_s32(ker_0, 28, lower_nibble_mask);
                    ker_0 = vshrq_m_n_s32(ker_0, ker_0, 24, lower_nibble_mask);

                    ker_0 = vshrq_n_s32(ker_0, 4);
                }
                else
                {
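                    /* Mid-byte group: the four weights straddle byte boundaries, so
                       pick out high/low/high/low nibbles with scalar shifts. */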
                    int8_t temp[] = {
                        rhs_0[0] >> 4, (int8_t)(rhs_0[1] << 4) >> 4, rhs_0[1] >> 4, (int8_t)(rhs_0[2] << 4) >> 4};
                    ker_0 = vldrbq_s32(temp);
                }

                ker_sum = vaddq_s32(ker_sum, ker_0);

                int32x4_t ip_0 = vldrbq_s32(lhs_0);
                out_0 += vmulq_s32(ip_0, ker_0);

                int32x4_t ip_1 = vldrbq_s32(lhs_1);
                out_1 += vmulq_s32(ip_1, ker_0);

                int32x4_t ip_2 = vldrbq_s32(lhs_2);
                out_2 += vmulq_s32(ip_2, ker_0);

                int32x4_t ip_3 = vldrbq_s32(lhs_3);
                out_3 += vmulq_s32(ip_3, ker_0);

                lhs_0 += S4_CH_IN_BLOCK_MVE;
                lhs_1 += S4_CH_IN_BLOCK_MVE;
                lhs_2 += S4_CH_IN_BLOCK_MVE;
                lhs_3 += S4_CH_IN_BLOCK_MVE;

                get_low_nibble = !get_low_nibble;
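                /* With an odd channel count, the packed row stride alternates between
                   total_ch / 2 and total_ch / 2 + 1 bytes. */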
                rhs_0 += (total_ch >> 1) + get_low_nibble;
            }
        }
        else
        {
            for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++)
            {
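                /* Even channel count: every kernel row starts on a byte boundary.
                   Unpack two packed bytes into four sign-extended int4 weights using
                   the same shift sequence as the byte-aligned case above. */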
                int32x4_t ker_0 = vldrbq_gather_offset_s32(rhs_0, gather_offset);

                ker_0 = vrshlq_m_n_s32(ker_0, 28, lower_nibble_mask);
                ker_0 = vshrq_m_n_s32(ker_0, ker_0, 24, lower_nibble_mask);

                ker_0 = vshrq_n_s32(ker_0, 4);

                ker_sum = vaddq_s32(ker_sum, ker_0);

                int32x4_t ip_0 = vldrbq_s32(lhs_0);
                out_0 += vmulq_s32(ip_0, ker_0);

                int32x4_t ip_1 = vldrbq_s32(lhs_1);
                out_1 += vmulq_s32(ip_1, ker_0);

                int32x4_t ip_2 = vldrbq_s32(lhs_2);
                out_2 += vmulq_s32(ip_2, ker_0);

                int32x4_t ip_3 = vldrbq_s32(lhs_3);
                out_3 += vmulq_s32(ip_3, ker_0);

                lhs_0 += S4_CH_IN_BLOCK_MVE;
                lhs_1 += S4_CH_IN_BLOCK_MVE;
                lhs_2 += S4_CH_IN_BLOCK_MVE;
                lhs_3 += S4_CH_IN_BLOCK_MVE;

                rhs_0 += total_ch >> 1;
            }
        }

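        /* Add the input offset contribution once per accumulator:
           sum((x + offset) * w) = sum(x * w) + offset * sum(w). */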
        ker_sum = vmulq_n_s32(ker_sum, input_offset);
        out_0 = ker_sum + out_0;
        out_1 = ker_sum + out_1;
        out_2 = ker_sum + out_2;
        out_3 = ker_sum + out_3;

        const int32x4_t mult = vldrwq_s32(out_mult);
        const int32x4_t shift = vldrwq_s32(out_shift);
        out_mult += 4;
        out_shift += 4;
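        /* Predicated stores write only the channels that are still active, which
           handles the tail when active_ch is not a multiple of four. */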
        mve_pred16_t p = vctp32q(num_ch_to_process);

        out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
        out_0 = vaddq_n_s32(out_0, out_offset);
        out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
        out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
        vstrbq_p_s32(out, out_0, p);

        out_1 = arm_requantize_mve_32x4(out_1, mult, shift);
        out_1 = vaddq_n_s32(out_1, out_offset);
        out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
        out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
        vstrbq_p_s32(out + total_ch, out_1, p);

        out_2 = arm_requantize_mve_32x4(out_2, mult, shift);
        out_2 = vaddq_n_s32(out_2, out_offset);
        out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min));
        out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max));
        vstrbq_p_s32(out + 2 * total_ch, out_2, p);

        out_3 = arm_requantize_mve_32x4(out_3, mult, shift);
        out_3 = vaddq_n_s32(out_3, out_offset);
        out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min));
        out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max));
        vstrbq_p_s32(out + 3 * total_ch, out_3, p);
    }

    return ARM_CMSIS_NN_SUCCESS;
#else
    (void)lhs;
    (void)rhs;
    (void)input_offset;
    (void)active_ch;
    (void)total_ch;
    (void)out_shift;
    (void)out_mult;
    (void)out_offset;
    (void)activation_min;
    (void)activation_max;
    (void)row_x_col;
    (void)output_bias;
    (void)out;
    return ARM_CMSIS_NN_NO_IMPL_ERROR;
#endif
}

/**
 * @} end of Doxygen group
 */