1 /*
2  * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  *
6  * Licensed under the Apache License, Version 2.0 (the License); you may
7  * not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 /* ----------------------------------------------------------------------
20  * Project:      CMSIS NN Library
21  * Title:        arm_elementwise_add_s8
22  * Description:  Elementwise add
23  *
24  * $Date:        5 January 2023
25  * $Revision:    V.3.1.0
26  *
27  * Target :  Arm(R) M-Profile Architecture
28  *
29  * -------------------------------------------------------------------- */
30 
31 #include "arm_nnfunctions.h"
32 #include "arm_nnsupportfunctions.h"
33 
34 /**
35  *  @ingroup Public
36  */
37 
38 /**
39  * @addtogroup groupElementwise
40  * @{
41  */
42 
43 /*
44  * s8 elementwise add
45  *
46  * Refer header file for details.
47  *
48  */
49 
50 /* Note: __SHIFT is expected to be <=0 */
51 
arm_elementwise_add_s8(const int8_t * input_1_vect,const int8_t * input_2_vect,const int32_t input_1_offset,const int32_t input_1_mult,const int32_t input_1_shift,const int32_t input_2_offset,const int32_t input_2_mult,const int32_t input_2_shift,const int32_t left_shift,int8_t * output,const int32_t out_offset,const int32_t out_mult,const int32_t out_shift,const int32_t out_activation_min,const int32_t out_activation_max,const int32_t block_size)52 arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect,
53                                            const int8_t *input_2_vect,
54                                            const int32_t input_1_offset,
55                                            const int32_t input_1_mult,
56                                            const int32_t input_1_shift,
57                                            const int32_t input_2_offset,
58                                            const int32_t input_2_mult,
59                                            const int32_t input_2_shift,
60                                            const int32_t left_shift,
61                                            int8_t *output,
62                                            const int32_t out_offset,
63                                            const int32_t out_mult,
64                                            const int32_t out_shift,
65                                            const int32_t out_activation_min,
66                                            const int32_t out_activation_max,
67                                            const int32_t block_size)
68 {
69 #if defined(ARM_MATH_MVEI)
70     int32_t count = block_size;
71 
72     while (count > 0)
73     {
74         int32x4_t vect_1;
75         int32x4_t vect_2;
76 
77         mve_pred16_t p = vctp32q((uint32_t)count);
78 
79         vect_1 = vldrbq_z_s32(input_1_vect, p);
80         vect_2 = vldrbq_z_s32(input_2_vect, p);
81 
82         vect_1 = vaddq_s32(vect_1, vdupq_n_s32(input_1_offset));
83         vect_2 = vaddq_s32(vect_2, vdupq_n_s32(input_2_offset));
84 
85         vect_1 = vshlq_r_s32(vect_1, left_shift);
86         vect_2 = vshlq_r_s32(vect_2, left_shift);
87 
88         vect_1 = arm_requantize_mve(vect_1, input_1_mult, input_1_shift);
89         vect_2 = arm_requantize_mve(vect_2, input_2_mult, input_2_shift);
90 
91         vect_1 = vaddq_s32(vect_1, vect_2);
92         vect_1 = arm_requantize_mve(vect_1, out_mult, out_shift);
93 
94         vect_1 = vaddq_n_s32(vect_1, out_offset);
95 
96         vect_1 = vmaxq_s32(vect_1, vdupq_n_s32(out_activation_min));
97         vect_1 = vminq_s32(vect_1, vdupq_n_s32(out_activation_max));
98 
99         input_1_vect += 4;
100         input_2_vect += 4;
101         vstrbq_p_s32(output, vect_1, p);
102 
103         output += 4;
104         count -= 4;
105     }
106 #else
107     int32_t loop_count;
108     int32_t input_1;
109     int32_t input_2;
110     int32_t sum;
111 
112     #if defined(ARM_MATH_DSP)
113     int32_t a_1, b_1, a_2, b_2;
114 
115     int32_t offset_1_packed, offset_2_packed;
116 
117     int8_t r1, r2, r3, r4;
118 
119     offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL);
120     offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL);
121 
122     loop_count = block_size >> 2;
123 
124     while (loop_count > 0)
125     {
126         /* 4 outputs are calculated in one loop. The order of calculation is follows the order of output sign extension
127            intrinsic */
128         input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
129         input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);
130 
131         a_1 = SADD16(a_1, offset_1_packed);
132         b_1 = SADD16(b_1, offset_1_packed);
133 
134         a_2 = SADD16(a_2, offset_2_packed);
135         b_2 = SADD16(b_2, offset_2_packed);
136 
137         /* Sum 1 */
138         input_1 = (b_1 & 0x0FFFF) << left_shift;
139 
140         input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
141 
142         input_2 = (b_2 & 0x0FFFF) << left_shift;
143         input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
144 
145         sum = input_1 + input_2;
146         sum = arm_nn_requantize(sum, out_mult, out_shift);
147         sum += out_offset;
148         sum = MAX(sum, out_activation_min);
149         sum = MIN(sum, out_activation_max);
150         r1 = (int8_t)sum;
151 
152         /* Sum 3 */
153         input_1 = ((b_1 >> 16) & 0x0FFFF) << left_shift;
154         input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
155 
156         input_2 = ((b_2 >> 16) & 0x0FFFF) << left_shift;
157         input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
158 
159         sum = input_1 + input_2;
160         sum = arm_nn_requantize(sum, out_mult, out_shift);
161         sum += out_offset;
162         sum = MAX(sum, out_activation_min);
163         sum = MIN(sum, out_activation_max);
164         r3 = (int8_t)sum;
165 
166         /* Sum 2 */
167         input_1 = (a_1 & 0x0FFFF) << left_shift;
168         input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
169 
170         input_2 = (a_2 & 0x0FFFF) << left_shift;
171         input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
172 
173         sum = input_1 + input_2;
174         sum = arm_nn_requantize(sum, out_mult, out_shift);
175         sum += out_offset;
176         sum = MAX(sum, out_activation_min);
177         sum = MIN(sum, out_activation_max);
178         r2 = (int8_t)sum;
179 
180         /* Sum 4 */
181         input_1 = ((a_1 >> 16) & 0x0FFFF) << left_shift;
182         input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
183 
184         input_2 = ((a_2 >> 16) & 0x0FFFF) << left_shift;
185         input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
186 
187         sum = input_1 + input_2;
188         sum = arm_nn_requantize(sum, out_mult, out_shift);
189         sum += out_offset;
190         sum = MAX(sum, out_activation_min);
191         sum = MIN(sum, out_activation_max);
192         r4 = (int8_t)sum;
193 
194         arm_nn_write_s8x4_ia(&output, PACK_S8x4_32x1(r1, r2, r3, r4));
195 
196         loop_count--;
197     }
198 
199     loop_count = block_size & 0x3;
200     #else
201     loop_count = block_size;
202     #endif
203 
204     while (loop_count > 0)
205     {
206         /* C = A + B */
207 
208         input_1 = (*input_1_vect++ + input_1_offset) << left_shift;
209         input_2 = (*input_2_vect++ + input_2_offset) << left_shift;
210 
211         input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
212         input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
213 
214         sum = input_1 + input_2;
215         sum = arm_nn_requantize(sum, out_mult, out_shift);
216         sum += out_offset;
217 
218         sum = MAX(sum, out_activation_min);
219         sum = MIN(sum, out_activation_max);
220 
221         *output++ = (int8_t)sum;
222 
223         /* Decrement loop counter */
224         loop_count--;
225     }
226 
227 #endif /* ARM_MATH_MVEI */
228 
229     return (ARM_CMSIS_NN_SUCCESS);
230 }
231 
232 /**
233  * @} end of Doxygen group
234  */
235