/*
 * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_mat_mult_s8.c
 * Description:  General matrix-multiplication function
 *
 * $Date:        09. October 2020
 * $Revision:    V.2.0.5
 *
 * Target Processor:  Cortex-M cores
 * -------------------------------------------------------------------- */

#include "arm_nnsupportfunctions.h"

/*
 * s8 general matrix multiplication function with per-channel requantization
 * for up to 4 column batches.
 *
 * Refer to the header file for details.
 *
 */

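/*
 * A minimal usage sketch (illustrative only: OUT_CH, ROW_LEN and the buffer
 * names below are assumptions, not part of this file). input_row holds the
 * kernel as [output_ch x row_len] and input_col holds the column batches as
 * [col_batches x row_len]:
 *
 *     q7_t out_buf[4 * OUT_CH];                       // room for 4 column batches
 *     q7_t *next = arm_nn_mat_mult_s8(kernel,         // [OUT_CH x ROW_LEN]
 *                                     columns,        // [4 x ROW_LEN]
 *                                     OUT_CH,
 *                                     4,              // col_batches
 *                                     shift_per_ch,   // [OUT_CH]
 *                                     mult_per_ch,    // [OUT_CH]
 *                                     out_offset,
 *                                     col_offset,
 *                                     0,              // row_offset (unused on MVE)
 *                                     act_min,
 *                                     act_max,
 *                                     ROW_LEN,
 *                                     bias_per_ch,    // [OUT_CH] or NULL
 *                                     out_buf);
 *     // next == NULL means the target has no MVE support and nothing was written.
 */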
q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
                         const q7_t *input_col,
                         const uint16_t output_ch,
                         const uint16_t col_batches,
                         const int32_t *output_shift,
                         const int32_t *output_mult,
                         const int32_t out_offset,
                         const int32_t col_offset,
                         const int32_t row_offset,
                         const int16_t activation_min,
                         const int16_t activation_max,
                         const uint16_t row_len,
                         const int32_t *const bias,
                         q7_t *out)
{
#if defined(ARM_MATH_MVEI)
    (void)row_offset;
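    /* Fast path: a full batch of 4 columns is processed per output channel;
     * anything smaller falls through to the single-column loop below. */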
    if (col_batches == 4)
    {
        for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
        {
            int32_t row_len_tmp = row_len;
            const int8_t *ip_r0 = input_row + (i_out_ch * row_len);
            const int8_t *ip_c0 = input_col;
            const int8_t *ip_c1 = input_col + row_len;
            const int8_t *ip_c2 = input_col + (2 * row_len);
            const int8_t *ip_c3 = input_col + (3 * row_len);

            int32_t acc_0 = 0;
            int32_t acc_1 = 0;
            int32_t acc_2 = 0;
            int32_t acc_3 = 0;
            const int32_t row_loop_cnt = (row_len + 7) / 8;

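            /* Each iteration consumes 8 int8 values per operand, widened to
             * int16; the tail predicate p masks off lanes past row_len. */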
            for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
            {
                mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
                const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
                row_len_tmp -= 8;

                int16x8_t r0 = vldrbq_z_s16(ip_r0, p);
                ip_r0 += 8;

                int16x8_t c0 = vldrbq_z_s16(ip_c0, p);
                ip_c0 += 8;
                c0 = vaddq_m_s16(vuninitializedq_s16(), c0, offset, p);

                int16x8_t c1 = vldrbq_z_s16(ip_c1, p);
                ip_c1 += 8;
                c1 = vaddq_m_s16(vuninitializedq_s16(), c1, offset, p);

                int16x8_t c2 = vldrbq_z_s16(ip_c2, p);
                ip_c2 += 8;
                c2 = vaddq_m_s16(vuninitializedq_s16(), c2, offset, p);

                int16x8_t c3 = vldrbq_z_s16(ip_c3, p);
                ip_c3 += 8;
                c3 = vaddq_m_s16(vuninitializedq_s16(), c3, offset, p);

                acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p);
                acc_1 = vmladavaq_p_s16(acc_1, r0, c1, p);
                acc_2 = vmladavaq_p_s16(acc_2, r0, c2, p);
                acc_3 = vmladavaq_p_s16(acc_3, r0, c3, p);
            }

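            /* Combine the four accumulators into one vector, add the
             * per-channel bias, requantize and clamp to the activation range. */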
            int32x4_t res = {acc_0, acc_1, acc_2, acc_3};
            if (bias)
            {
                res = vaddq_n_s32(res, bias[i_out_ch]);
            }
            res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]);
            res = vaddq_n_s32(res, out_offset);

            res = vmaxq_s32(res, vdupq_n_s32(activation_min));
            res = vminq_s32(res, vdupq_n_s32(activation_max));

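            /* The four results are the same channel of four consecutive output
             * columns, spaced output_ch bytes apart, hence the scatter store. */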
            const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3};
            vstrbq_scatter_offset_s32(&out[i_out_ch], scatter_offset, res);
        }
        out += 4 * output_ch;
    }
    else
    {
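        /* Remainder path: col_batches is less than 4 here, so the masks reduce
         * to a plain 0..col_batches loop, one column per iteration. */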
        for (int i_col_batch = (col_batches & ~0x3); i_col_batch < (col_batches & 0x3); i_col_batch++)
        {
            for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
            {
                int32_t row_len_tmp = row_len;

                const int8_t *ip_r0 = input_row + (i_out_ch * row_len);
                const int8_t *ip_c0 = input_col + (i_col_batch * row_len);
                int32_t acc_0 = 0;
                const int32_t row_loop_cnt = (row_len + 7) / 8;

                for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
                {
                    const mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
                    const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
                    row_len_tmp -= 8;

                    int16x8_t r0 = vldrbq_z_s16(ip_r0, p);
                    ip_r0 += 8;
                    int16x8_t c0 = vldrbq_z_s16(ip_c0, p);
                    ip_c0 += 8;

                    c0 = vaddq_m_s16(vuninitializedq_s16(), c0, offset, p);
                    acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p);
                }

                if (bias)
                {
                    acc_0 += bias[i_out_ch];
                }
                acc_0 = arm_nn_requantize(acc_0, output_mult[i_out_ch], output_shift[i_out_ch]);
                acc_0 += out_offset;
                acc_0 = MAX(acc_0, activation_min);
                acc_0 = MIN(acc_0, activation_max);
                out[i_out_ch] = (q7_t)acc_0;
            }
            out += output_ch;
        }
    }
    return out;

#else
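    /* No scalar fallback is provided for targets without MVE; the NULL return
     * signals the caller to use a different implementation. */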
    (void)input_row;
    (void)input_col;
    (void)output_ch;
    (void)col_batches;
    (void)output_shift;
    (void)output_mult;
    (void)out_offset;
    (void)col_offset;
    (void)row_offset;
    (void)activation_min;
    (void)activation_max;
    (void)row_len;
    (void)bias;
    (void)out;
    return NULL;
#endif
}