1 /*
2 * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
3 *
4 * SPDX-License-Identifier: Apache-2.0
5 *
6 * Licensed under the Apache License, Version 2.0 (the License); you may
7 * not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19 /* ----------------------------------------------------------------------
20 * Project: CMSIS NN Library
21 * Title: arm_elementwise_add_s8
22 * Description: Elementwise add
23 *
24 * $Date: 5 January 2023
25 * $Revision: V.3.1.0
26 *
27 * Target : Arm(R) M-Profile Architecture
28 *
29 * -------------------------------------------------------------------- */
30
31 #include "arm_nnfunctions.h"
32 #include "arm_nnsupportfunctions.h"
33
34 /**
35 * @ingroup Public
36 */
37
38 /**
39 * @addtogroup groupElementwise
40 * @{
41 */
42
43 /*
44 * s8 elementwise add
45 *
46 * Refer header file for details.
47 *
48 */
49
50 /* Note: __SHIFT is expected to be <=0 */
51
arm_elementwise_add_s8(const int8_t * input_1_vect,const int8_t * input_2_vect,const int32_t input_1_offset,const int32_t input_1_mult,const int32_t input_1_shift,const int32_t input_2_offset,const int32_t input_2_mult,const int32_t input_2_shift,const int32_t left_shift,int8_t * output,const int32_t out_offset,const int32_t out_mult,const int32_t out_shift,const int32_t out_activation_min,const int32_t out_activation_max,const int32_t block_size)52 arm_cmsis_nn_status arm_elementwise_add_s8(const int8_t *input_1_vect,
53 const int8_t *input_2_vect,
54 const int32_t input_1_offset,
55 const int32_t input_1_mult,
56 const int32_t input_1_shift,
57 const int32_t input_2_offset,
58 const int32_t input_2_mult,
59 const int32_t input_2_shift,
60 const int32_t left_shift,
61 int8_t *output,
62 const int32_t out_offset,
63 const int32_t out_mult,
64 const int32_t out_shift,
65 const int32_t out_activation_min,
66 const int32_t out_activation_max,
67 const int32_t block_size)
68 {
69 #if defined(ARM_MATH_MVEI)
70 int32_t count = block_size;
71
72 while (count > 0)
73 {
74 int32x4_t vect_1;
75 int32x4_t vect_2;
76
77 mve_pred16_t p = vctp32q((uint32_t)count);
78
79 vect_1 = vldrbq_z_s32(input_1_vect, p);
80 vect_2 = vldrbq_z_s32(input_2_vect, p);
81
82 vect_1 = vaddq_s32(vect_1, vdupq_n_s32(input_1_offset));
83 vect_2 = vaddq_s32(vect_2, vdupq_n_s32(input_2_offset));
84
85 vect_1 = vshlq_r_s32(vect_1, left_shift);
86 vect_2 = vshlq_r_s32(vect_2, left_shift);
87
88 vect_1 = arm_requantize_mve(vect_1, input_1_mult, input_1_shift);
89 vect_2 = arm_requantize_mve(vect_2, input_2_mult, input_2_shift);
90
91 vect_1 = vaddq_s32(vect_1, vect_2);
92 vect_1 = arm_requantize_mve(vect_1, out_mult, out_shift);
93
94 vect_1 = vaddq_n_s32(vect_1, out_offset);
95
96 vect_1 = vmaxq_s32(vect_1, vdupq_n_s32(out_activation_min));
97 vect_1 = vminq_s32(vect_1, vdupq_n_s32(out_activation_max));
98
99 input_1_vect += 4;
100 input_2_vect += 4;
101 vstrbq_p_s32(output, vect_1, p);
102
103 output += 4;
104 count -= 4;
105 }
106 #else
107 int32_t loop_count;
108 int32_t input_1;
109 int32_t input_2;
110 int32_t sum;
111
112 #if defined(ARM_MATH_DSP)
113 int32_t a_1, b_1, a_2, b_2;
114
115 int32_t offset_1_packed, offset_2_packed;
116
117 int8_t r1, r2, r3, r4;
118
119 offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL);
120 offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL);
121
122 loop_count = block_size >> 2;
123
124 while (loop_count > 0)
125 {
126 /* 4 outputs are calculated in one loop. The order of calculation is follows the order of output sign extension
127 intrinsic */
128 input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
129 input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);
130
131 a_1 = SADD16(a_1, offset_1_packed);
132 b_1 = SADD16(b_1, offset_1_packed);
133
134 a_2 = SADD16(a_2, offset_2_packed);
135 b_2 = SADD16(b_2, offset_2_packed);
136
137 /* Sum 1 */
138 input_1 = (b_1 & 0x0FFFF) << left_shift;
139
140 input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
141
142 input_2 = (b_2 & 0x0FFFF) << left_shift;
143 input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
144
145 sum = input_1 + input_2;
146 sum = arm_nn_requantize(sum, out_mult, out_shift);
147 sum += out_offset;
148 sum = MAX(sum, out_activation_min);
149 sum = MIN(sum, out_activation_max);
150 r1 = (int8_t)sum;
151
152 /* Sum 3 */
153 input_1 = ((b_1 >> 16) & 0x0FFFF) << left_shift;
154 input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
155
156 input_2 = ((b_2 >> 16) & 0x0FFFF) << left_shift;
157 input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
158
159 sum = input_1 + input_2;
160 sum = arm_nn_requantize(sum, out_mult, out_shift);
161 sum += out_offset;
162 sum = MAX(sum, out_activation_min);
163 sum = MIN(sum, out_activation_max);
164 r3 = (int8_t)sum;
165
166 /* Sum 2 */
167 input_1 = (a_1 & 0x0FFFF) << left_shift;
168 input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
169
170 input_2 = (a_2 & 0x0FFFF) << left_shift;
171 input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
172
173 sum = input_1 + input_2;
174 sum = arm_nn_requantize(sum, out_mult, out_shift);
175 sum += out_offset;
176 sum = MAX(sum, out_activation_min);
177 sum = MIN(sum, out_activation_max);
178 r2 = (int8_t)sum;
179
180 /* Sum 4 */
181 input_1 = ((a_1 >> 16) & 0x0FFFF) << left_shift;
182 input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
183
184 input_2 = ((a_2 >> 16) & 0x0FFFF) << left_shift;
185 input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
186
187 sum = input_1 + input_2;
188 sum = arm_nn_requantize(sum, out_mult, out_shift);
189 sum += out_offset;
190 sum = MAX(sum, out_activation_min);
191 sum = MIN(sum, out_activation_max);
192 r4 = (int8_t)sum;
193
194 arm_nn_write_s8x4_ia(&output, PACK_S8x4_32x1(r1, r2, r3, r4));
195
196 loop_count--;
197 }
198
199 loop_count = block_size & 0x3;
200 #else
201 loop_count = block_size;
202 #endif
203
204 while (loop_count > 0)
205 {
206 /* C = A + B */
207
208 input_1 = (*input_1_vect++ + input_1_offset) << left_shift;
209 input_2 = (*input_2_vect++ + input_2_offset) << left_shift;
210
211 input_1 = arm_nn_requantize(input_1, input_1_mult, input_1_shift);
212 input_2 = arm_nn_requantize(input_2, input_2_mult, input_2_shift);
213
214 sum = input_1 + input_2;
215 sum = arm_nn_requantize(sum, out_mult, out_shift);
216 sum += out_offset;
217
218 sum = MAX(sum, out_activation_min);
219 sum = MIN(sum, out_activation_max);
220
221 *output++ = (int8_t)sum;
222
223 /* Decrement loop counter */
224 loop_count--;
225 }
226
227 #endif /* ARM_MATH_MVEI */
228
229 return (ARM_CMSIS_NN_SUCCESS);
230 }
231
232 /**
233 * @} end of Doxygen group
234 */
235