1 /*
2  * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  *
6  * Licensed under the Apache License, Version 2.0 (the License); you may
7  * not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 /* ----------------------------------------------------------------------
20  * Project:      CMSIS NN Library
21  * Title:        arm_nn_depthwise_conv_nt_t_s8.c
22  * Description:  Depthwise convolution on matrices with no padding.
23  *
24  * $Date:        09. October 2020
25  * $Revision:    V.1.0.2
26  *
27  * Target Processor:  Cortex-M processors with MVE extension.
28  * -------------------------------------------------------------------- */
29 
30 #include "arm_nnsupportfunctions.h"
31 
32 /**
33  * @ingroup groupSupport
34  */
35 
36 /**
37  * @addtogroup NNBasicMath
38  * @{
39  */
40 
41 /*
42  * Depthwise convolution of rhs matrix with 4 lhs matrices with no padding. Dimensions are the same for lhs and rhs.
43  *
44  * Refer header file for details.
45  *
46  */
47 
arm_nn_depthwise_conv_nt_t_s8(const q7_t * lhs,const q7_t * rhs,const int32_t input_offset,const uint16_t num_ch,const int32_t * out_shift,const int32_t * out_mult,const int32_t out_offset,const int32_t activation_min,const int32_t activation_max,const uint16_t row_x_col,const int32_t * const output_bias,q7_t * out)48 q7_t *arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs,
49                                     const q7_t *rhs,
50                                     const int32_t input_offset,
51                                     const uint16_t num_ch,
52                                     const int32_t *out_shift,
53                                     const int32_t *out_mult,
54                                     const int32_t out_offset,
55                                     const int32_t activation_min,
56                                     const int32_t activation_max,
57                                     const uint16_t row_x_col,
58                                     const int32_t *const output_bias,
59                                     q7_t *out)
60 {
61 #if defined(ARM_MATH_MVEI)
62     const int32_t *bias = output_bias;
63     int32_t loop_count = (num_ch + 3) / 4;
64     uint32_t num_ch_to_process = num_ch;
65 
66     for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count;
67          num_ch_to_process -= 4, offset += 4, out += 4, i_loop_cnt++)
68     {
69         int32x4_t out_0 = vldrwq_s32(bias);
70         int32x4_t out_1 = out_0;
71         int32x4_t out_2 = out_0;
72         int32x4_t out_3 = out_0;
73         bias += 4;
74 
75         const int8_t *rhs_0 = rhs + offset;
76         const int8_t *lhs_0 = lhs + offset;
77         const int8_t *lhs_1 = lhs + row_x_col * num_ch + offset;
78         const int8_t *lhs_2 = lhs + (row_x_col * num_ch * 2) + offset;
79         const int8_t *lhs_3 = lhs + (row_x_col * num_ch * 3) + offset;
80         int32x4_t ker_sum = vdupq_n_s32(0);
81 
82         for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++)
83         {
84             const int32x4_t ker_0 = vldrbq_s32(rhs_0);
85             ker_sum = vaddq_s32(ker_sum, ker_0);
86 
87             int32x4_t ip_0 = vldrbq_s32(lhs_0);
88             out_0 += vmulq_s32(ip_0, ker_0);
89 
90             int32x4_t ip_1 = vldrbq_s32(lhs_1);
91             out_1 += vmulq_s32(ip_1, ker_0);
92 
93             int32x4_t ip_2 = vldrbq_s32(lhs_2);
94             out_2 += vmulq_s32(ip_2, ker_0);
95 
96             int32x4_t ip_3 = vldrbq_s32(lhs_3);
97             out_3 += vmulq_s32(ip_3, ker_0);
98 
99             lhs_0 += num_ch;
100             lhs_1 += num_ch;
101             lhs_2 += num_ch;
102             lhs_3 += num_ch;
103 
104             rhs_0 += num_ch;
105         }
106 
107         ker_sum = vmulq_n_s32(ker_sum, input_offset);
108         out_0 = ker_sum + out_0;
109         out_1 = ker_sum + out_1;
110         out_2 = ker_sum + out_2;
111         out_3 = ker_sum + out_3;
112 
113         const int32x4_t mult = vldrwq_s32(out_mult);
114         const int32x4_t shift = vldrwq_s32(out_shift);
115         out_mult += 4;
116         out_shift += 4;
117         mve_pred16_t p = vctp32q(num_ch_to_process);
118 
119         out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
120         out_0 = vaddq_n_s32(out_0, out_offset);
121         out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
122         out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
123         vstrbq_p_s32(out, out_0, p);
124 
125         out_1 = arm_requantize_mve_32x4(out_1, mult, shift);
126         out_1 = vaddq_n_s32(out_1, out_offset);
127         out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
128         out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
129         vstrbq_p_s32(out + num_ch, out_1, p);
130 
131         out_2 = arm_requantize_mve_32x4(out_2, mult, shift);
132         out_2 = vaddq_n_s32(out_2, out_offset);
133         out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min));
134         out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max));
135         vstrbq_p_s32(out + 2 * num_ch, out_2, p);
136 
137         out_3 = arm_requantize_mve_32x4(out_3, mult, shift);
138         out_3 = vaddq_n_s32(out_3, out_offset);
139         out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min));
140         out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max));
141         vstrbq_p_s32(out + 3 * num_ch, out_3, p);
142     }
143 
144     const int tail_ch = num_ch & 0x3;
145     if (tail_ch != 0)
146     {
147         out -= (4 - tail_ch);
148     }
149 
150     return out + (3 * num_ch);
151 #else
152     (void)lhs;
153     (void)rhs;
154     (void)input_offset;
155     (void)num_ch;
156     (void)out_shift;
157     (void)out_mult;
158     (void)out_offset;
159     (void)activation_min;
160     (void)activation_max;
161     (void)row_x_col;
162     (void)output_bias;
163     (void)out;
164     return NULL;
165 #endif
166 }
167 
168 /**
169  * @} end of NNBasicMath group
170  */
171