1 /*
2  * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  *
6  * Licensed under the Apache License, Version 2.0 (the License); you may
7  * not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 /* ----------------------------------------------------------------------
20  * Project:      CMSIS NN Library
21  * Title:        arm_nn_depthwise_conv_nt_t_s16.c
22  * Description:  Depthwise convolution on matrices with no padding.
23  *
24  * $Date:        26 October 2022
25  * $Revision:    V.1.0.1
26  *
27  * Target Processor:  Cortex-M processors with MVE extension
28  * -------------------------------------------------------------------- */
29 
30 #include "arm_nnsupportfunctions.h"
31 
32 /**
33  * @ingroup groupSupport
34  */
35 
36 /**
37  * @addtogroup supportConvolution
38  * @{
39  */
40 
/*
 * Depthwise convolution of rhs matrix with 4 lhs matrices with no padding. Dimensions are the same for lhs and rhs.
 *
 * Refer header file for details.
 *
 * Summary of what this implementation does (MVE builds only):
 *  - lhs is read as four consecutive matrices of row_x_col * num_ch int16 values each.
 *  - rhs is read as one matrix of row_x_col * num_ch int8 values.
 *  - Channels are handled in groups of four; for every channel the products of
 *    lhs and rhs are accumulated over the row_x_col elements in 32 bits.
 *  - Each accumulator gets the optional per-channel 64-bit bias added, is
 *    requantized with the per-channel out_mult / out_shift, clamped to
 *    [activation_min, activation_max] and stored as int16.
 *  - The four results for a channel are written num_ch elements apart in out.
 *
 * Returns out advanced by 4 * num_ch elements on MVE builds, NULL when
 * ARM_MATH_MVEI is not defined (no computation is done in that case).
 */
int16_t *arm_nn_depthwise_conv_nt_t_s16(const int16_t *lhs,
                                        const int8_t *rhs,
                                        const uint16_t num_ch,
                                        const int32_t *out_shift,
                                        const int32_t *out_mult,
                                        const int32_t activation_min,
                                        const int32_t activation_max,
                                        const uint16_t row_x_col,
                                        const int64_t *const output_bias,
                                        int16_t *out)
{
#if defined(ARM_MATH_MVEI)

    const int64_t *bias = output_bias;
    /* Number of 4-channel groups, rounding up so a partial tail group is included. */
    const int32_t loop_count = (num_ch + 3) / 4;
    /* Element distance between two consecutive lhs matrices (loop invariant). */
    const int32_t lhs_stride = row_x_col * num_ch;
    uint32_t num_ch_to_process = num_ch;

    for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count;
         num_ch_to_process -= 4, offset += 4, out += 4, i_loop_cnt++)
    {
        const int8_t *rhs_0 = rhs + offset;
        const int16_t *lhs_0 = lhs + offset;
        const int16_t *lhs_1 = lhs + lhs_stride + offset;
        const int16_t *lhs_2 = lhs + (lhs_stride * 2) + offset;
        const int16_t *lhs_3 = lhs + (lhs_stride * 3) + offset;

        int32x4_t out_0 = vdupq_n_s32(0);
        int32x4_t out_1 = vdupq_n_s32(0);
        int32x4_t out_2 = vdupq_n_s32(0);
        int32x4_t out_3 = vdupq_n_s32(0);

        /*
         * NOTE(review): the loads below are unpredicated and always fetch four
         * lanes, so a tail group (num_ch not a multiple of 4) reads up to three
         * elements past the last channel. Those lanes are never stored; confirm
         * that callers' buffers tolerate the over-read.
         */
        for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++)
        {
            /* One kernel vector is shared by all four lhs matrices. */
            const int32x4_t ker_0 = vldrbq_s32(rhs_0);

            int32x4_t ip_0 = vldrhq_s32(lhs_0);
            out_0 += vmulq_s32(ip_0, ker_0);

            int32x4_t ip_1 = vldrhq_s32(lhs_1);
            out_1 += vmulq_s32(ip_1, ker_0);

            int32x4_t ip_2 = vldrhq_s32(lhs_2);
            out_2 += vmulq_s32(ip_2, ker_0);

            int32x4_t ip_3 = vldrhq_s32(lhs_3);
            out_3 += vmulq_s32(ip_3, ker_0);

            /* Step to the same channel group of the next row/column element. */
            lhs_0 += num_ch;
            lhs_1 += num_ch;
            lhs_2 += num_ch;
            lhs_3 += num_ch;

            rhs_0 += num_ch;
        }

        /*
         * Only requantize lanes that correspond to real channels. Iterating all
         * four lanes on a tail group would index out_mult, out_shift and bias
         * past their num_ch entries. The remaining lanes are masked out by the
         * predicated stores below, so their contents do not matter.
         */
        const int valid_lanes = (num_ch_to_process < 4U) ? (int)num_ch_to_process : 4;

        for (int i_requantize = 0; i_requantize < valid_lanes; i_requantize++)
        {
            int32_t reduced_multiplier = REDUCE_MULTIPLIER(out_mult[i_requantize]);
            int32_t shift = out_shift[i_requantize];
            int64_t in_requantize_0 = (int64_t)out_0[i_requantize];
            int64_t in_requantize_1 = (int64_t)out_1[i_requantize];
            int64_t in_requantize_2 = (int64_t)out_2[i_requantize];
            int64_t in_requantize_3 = (int64_t)out_3[i_requantize];

            /* The bias is per channel, so the same value is applied to all four outputs. */
            if (bias)
            {
                in_requantize_0 += *bias;
                in_requantize_1 += *bias;
                in_requantize_2 += *bias;
                in_requantize_3 += *bias;
                bias++;
            }

            out_0[i_requantize] = arm_nn_requantize_s64(in_requantize_0, reduced_multiplier, shift);
            out_1[i_requantize] = arm_nn_requantize_s64(in_requantize_1, reduced_multiplier, shift);
            out_2[i_requantize] = arm_nn_requantize_s64(in_requantize_2, reduced_multiplier, shift);
            out_3[i_requantize] = arm_nn_requantize_s64(in_requantize_3, reduced_multiplier, shift);
        }

        /* Predicate that enables only the lanes belonging to remaining channels. */
        mve_pred16_t p = vctp32q(num_ch_to_process);

        out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
        out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
        vstrhq_p_s32(out, out_0, p);

        out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
        out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
        vstrhq_p_s32(out + num_ch, out_1, p);

        out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min));
        out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max));
        vstrhq_p_s32(out + 2 * num_ch, out_2, p);

        out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min));
        out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max));
        vstrhq_p_s32(out + 3 * num_ch, out_3, p);

        out_mult += 4;
        out_shift += 4;
    }

    /*
     * The loop advances out by 4 per group even for a partial tail group;
     * undo the excess so that out sits exactly num_ch past its start.
     */
    const int tail_ch = num_ch & 0x3;
    if (tail_ch != 0)
    {
        out -= (4 - tail_ch);
    }

    /* Skip the three further output rows that were written at +num_ch strides. */
    return out + (3 * num_ch);
#else
    /* No MVE support: nothing is computed and the caller is told so via NULL. */
    (void)lhs;
    (void)rhs;
    (void)num_ch;
    (void)out_shift;
    (void)out_mult;
    (void)activation_min;
    (void)activation_max;
    (void)row_x_col;
    (void)output_bias;
    (void)out;
    return NULL;
#endif
}
168 
169 /**
170  * @} end of Doxygen group
171  */
172