1 /*
2 * SPDX-FileCopyrightText: Copyright 2010-2020, 2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 *
4 * SPDX-License-Identifier: Apache-2.0
5 *
6 * Licensed under the Apache License, Version 2.0 (the License); you may
7 * not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19 /* ----------------------------------------------------------------------
20 * Project: CMSIS NN Library
21 * Title: arm_nn_depthwise_conv_nt_t_s8.c
22 * Description: Depthwise convolution on matrices with no padding.
23 *
24 * $Date: 26 October 2022
25 * $Revision: V.2.0.1
26 *
27 * Target Processor: Cortex-M processors with MVE extension.
28 * -------------------------------------------------------------------- */
29
30 #include "arm_nnsupportfunctions.h"
31
32 /**
33 * @ingroup groupSupport
34 */
35
36 /**
37 * @addtogroup supportConvolution
38 * @{
39 */
40
41 /*
42 * Depthwise convolution of rhs matrix with 4 lhs matrices with no padding. Dimensions are the same for lhs and rhs.
43 *
 * Refer to the header file for details.
45 *
46 */
arm_nn_depthwise_conv_nt_t_s8(const int8_t * lhs,const int8_t * rhs,const int32_t input_offset,const int32_t active_ch,const int32_t total_ch,const int32_t * out_shift,const int32_t * out_mult,const int32_t out_offset,const int32_t activation_min,const int32_t activation_max,const uint16_t row_x_col,const int32_t * const output_bias,int8_t * out)47 arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const int8_t *lhs,
48 const int8_t *rhs,
49 const int32_t input_offset,
50 const int32_t active_ch,
51 const int32_t total_ch,
52 const int32_t *out_shift,
53 const int32_t *out_mult,
54 const int32_t out_offset,
55 const int32_t activation_min,
56 const int32_t activation_max,
57 const uint16_t row_x_col,
58 const int32_t *const output_bias,
59 int8_t *out)
60 {
61 #if defined(ARM_MATH_MVEI)
62 const int32_t *bias = output_bias;
63 int32_t loop_count = (active_ch + 3) / 4;
64 uint32_t num_ch_to_process = active_ch;
65
66 for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count;
67 num_ch_to_process -= 4, offset += 4, out += 4, i_loop_cnt++)
68 {
69 int32x4_t out_0 = vdupq_n_s32(0);
70 if (bias)
71 {
72 out_0 = vldrwq_s32(bias);
73 bias += 4;
74 }
75 int32x4_t out_1 = out_0;
76 int32x4_t out_2 = out_0;
77 int32x4_t out_3 = out_0;
78
79 const int8_t *rhs_0 = rhs + offset;
80 const int8_t *lhs_0 = lhs + offset;
81 const int8_t *lhs_1 = lhs + row_x_col * CH_IN_BLOCK_MVE + offset;
82 const int8_t *lhs_2 = lhs + (row_x_col * CH_IN_BLOCK_MVE * 2) + offset;
83 const int8_t *lhs_3 = lhs + (row_x_col * CH_IN_BLOCK_MVE * 3) + offset;
84 int32x4_t ker_sum = vdupq_n_s32(0);
85
86 for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++)
87 {
88 const int32x4_t ker_0 = vldrbq_s32(rhs_0);
89 ker_sum = vaddq_s32(ker_sum, ker_0);
90
91 int32x4_t ip_0 = vldrbq_s32(lhs_0);
92 out_0 += vmulq_s32(ip_0, ker_0);
93
94 int32x4_t ip_1 = vldrbq_s32(lhs_1);
95 out_1 += vmulq_s32(ip_1, ker_0);
96
97 int32x4_t ip_2 = vldrbq_s32(lhs_2);
98 out_2 += vmulq_s32(ip_2, ker_0);
99
100 int32x4_t ip_3 = vldrbq_s32(lhs_3);
101 out_3 += vmulq_s32(ip_3, ker_0);
102
103 lhs_0 += CH_IN_BLOCK_MVE;
104 lhs_1 += CH_IN_BLOCK_MVE;
105 lhs_2 += CH_IN_BLOCK_MVE;
106 lhs_3 += CH_IN_BLOCK_MVE;
107
108 rhs_0 += total_ch;
109 }
110
111 ker_sum = vmulq_n_s32(ker_sum, input_offset);
112 out_0 = ker_sum + out_0;
113 out_1 = ker_sum + out_1;
114 out_2 = ker_sum + out_2;
115 out_3 = ker_sum + out_3;
116
117 const int32x4_t mult = vldrwq_s32(out_mult);
118 const int32x4_t shift = vldrwq_s32(out_shift);
119 out_mult += 4;
120 out_shift += 4;
121 mve_pred16_t p = vctp32q(num_ch_to_process);
122
123 out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
124 out_0 = vaddq_n_s32(out_0, out_offset);
125 out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
126 out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
127 vstrbq_p_s32(out, out_0, p);
128
129 out_1 = arm_requantize_mve_32x4(out_1, mult, shift);
130 out_1 = vaddq_n_s32(out_1, out_offset);
131 out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
132 out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
133 vstrbq_p_s32(out + total_ch, out_1, p);
134
135 out_2 = arm_requantize_mve_32x4(out_2, mult, shift);
136 out_2 = vaddq_n_s32(out_2, out_offset);
137 out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min));
138 out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max));
139 vstrbq_p_s32(out + 2 * total_ch, out_2, p);
140
141 out_3 = arm_requantize_mve_32x4(out_3, mult, shift);
142 out_3 = vaddq_n_s32(out_3, out_offset);
143 out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min));
144 out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max));
145 vstrbq_p_s32(out + 3 * total_ch, out_3, p);
146 }
147
148 return ARM_CMSIS_NN_SUCCESS;
149 #else
150 (void)lhs;
151 (void)rhs;
152 (void)input_offset;
153 (void)active_ch;
154 (void)total_ch;
155 (void)out_shift;
156 (void)out_mult;
157 (void)out_offset;
158 (void)activation_min;
159 (void)activation_max;
160 (void)row_x_col;
161 (void)output_bias;
162 (void)out;
163 return ARM_CMSIS_NN_NO_IMPL_ERROR;
164 #endif
165 }
166
167 /**
168 * @} end of Doxygen group
169 */
170