1 /******************************************************************************
2 * @file arm_vec_math.h
3 * @brief Public header file for CMSIS DSP Library
4 * @version V1.9.0
5 * @date 23 April 2021
6 * Target Processor: Cortex-M and Cortex-A cores
7 ******************************************************************************/
8 /*
9 * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved.
10 *
11 * SPDX-License-Identifier: Apache-2.0
12 *
13 * Licensed under the Apache License, Version 2.0 (the License); you may
14 * not use this file except in compliance with the License.
15 * You may obtain a copy of the License at
16 *
17 * www.apache.org/licenses/LICENSE-2.0
18 *
19 * Unless required by applicable law or agreed to in writing, software
20 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
21 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 * See the License for the specific language governing permissions and
23 * limitations under the License.
24 */
25
26 #ifndef _ARM_VEC_MATH_H
27 #define _ARM_VEC_MATH_H
28
29 #include "arm_math_types.h"
30 #include "arm_common_tables.h"
31 #include "arm_helium_utils.h"
32
33 #ifdef __cplusplus
34 extern "C"
35 {
36 #endif
37
38 #if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
39
/* Bit-pattern seed for the fast-reciprocal trick used in vrecip_f32:
   the initial 1/x estimate is formed as (INV_NEWTON_INIT_F32 - bits(x)). */
#define INV_NEWTON_INIT_F32  0x7EF127EA

/* ln(2); used by vlogq_f32 to fold the unbiased exponent back into the result. */
static const float32_t __logf_rng_f32=0.693147180f;
43
44
45 /* fast inverse approximation (3x newton) */
vrecip_medprec_f32(f32x4_t x)46 __STATIC_INLINE f32x4_t vrecip_medprec_f32(
47 f32x4_t x)
48 {
49 q31x4_t m;
50 f32x4_t b;
51 any32x4_t xinv;
52 f32x4_t ax = vabsq(x);
53
54 xinv.f = ax;
55 m = 0x3F800000 - (xinv.i & 0x7F800000);
56 xinv.i = xinv.i + m;
57 xinv.f = 1.41176471f - 0.47058824f * xinv.f;
58 xinv.i = xinv.i + m;
59
60 b = 2.0f - xinv.f * ax;
61 xinv.f = xinv.f * b;
62
63 b = 2.0f - xinv.f * ax;
64 xinv.f = xinv.f * b;
65
66 b = 2.0f - xinv.f * ax;
67 xinv.f = xinv.f * b;
68
69 xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
70 /*
71 * restore sign
72 */
73 xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));
74
75 return xinv.f;
76 }
77
78 /* fast inverse approximation (4x newton) */
vrecip_hiprec_f32(f32x4_t x)79 __STATIC_INLINE f32x4_t vrecip_hiprec_f32(
80 f32x4_t x)
81 {
82 q31x4_t m;
83 f32x4_t b;
84 any32x4_t xinv;
85 f32x4_t ax = vabsq(x);
86
87 xinv.f = ax;
88
89 m = 0x3F800000 - (xinv.i & 0x7F800000);
90 xinv.i = xinv.i + m;
91 xinv.f = 1.41176471f - 0.47058824f * xinv.f;
92 xinv.i = xinv.i + m;
93
94 b = 2.0f - xinv.f * ax;
95 xinv.f = xinv.f * b;
96
97 b = 2.0f - xinv.f * ax;
98 xinv.f = xinv.f * b;
99
100 b = 2.0f - xinv.f * ax;
101 xinv.f = xinv.f * b;
102
103 b = 2.0f - xinv.f * ax;
104 xinv.f = xinv.f * b;
105
106 xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
107 /*
108 * restore sign
109 */
110 xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));
111
112 return xinv.f;
113 }
114
vdiv_f32(f32x4_t num,f32x4_t den)115 __STATIC_INLINE f32x4_t vdiv_f32(
116 f32x4_t num, f32x4_t den)
117 {
118 return vmulq(num, vrecip_hiprec_f32(den));
119 }
120
121 /**
122 @brief Single-precision taylor dev.
123 @param[in] x f32 quad vector input
124 @param[in] coeffs f32 quad vector coeffs
125 @return destination f32 quad vector
126 */
127
vtaylor_polyq_f32(f32x4_t x,const float32_t * coeffs)128 __STATIC_INLINE f32x4_t vtaylor_polyq_f32(
129 f32x4_t x,
130 const float32_t * coeffs)
131 {
132 f32x4_t A = vfmasq(vdupq_n_f32(coeffs[4]), x, coeffs[0]);
133 f32x4_t B = vfmasq(vdupq_n_f32(coeffs[6]), x, coeffs[2]);
134 f32x4_t C = vfmasq(vdupq_n_f32(coeffs[5]), x, coeffs[1]);
135 f32x4_t D = vfmasq(vdupq_n_f32(coeffs[7]), x, coeffs[3]);
136 f32x4_t x2 = vmulq(x, x);
137 f32x4_t x4 = vmulq(x2, x2);
138 f32x4_t res = vfmaq(vfmaq_f32(A, B, x2), vfmaq_f32(C, D, x2), x4);
139
140 return res;
141 }
142
/* Split x into mantissa (returned) and unbiased exponent (written to *e)
   by direct manipulation of the IEEE-754 single-precision bit pattern.
   NOTE(review): sign, zero, denormal, inf and NaN inputs are not
   special-cased here (the arithmetic shift of r.i assumes the sign bit is
   clear); callers such as vlogq_f32 patch the x == 0 case themselves. */
__STATIC_INLINE f32x4_t vmant_exp_f32(
    f32x4_t x,
    int32x4_t * e)
{
    any32x4_t       r;
    int32x4_t       n;

    r.f = x;
    n = r.i >> 23;          /* biased exponent field */
    n = n - 127;            /* remove IEEE-754 bias */
    r.i = r.i - (n << 23);  /* rebias exponent: mantissa value lands in [1,2) */

    *e = n;
    return r.f;
}
158
159
/* Vector natural logarithm.
   Decomposes each lane into mantissa m in [1,2) and unbiased exponent e,
   evaluates a polynomial in m (coefficients from __logf_lut_f32, combined
   Estrin-style), then adds e * ln(2). log(0) is patched to -inf at the end.
   NOTE(review): negative, inf and NaN inputs are not special-cased. */
__STATIC_INLINE f32x4_t vlogq_f32(f32x4_t vecIn)
{
    q31x4_t         vecExpUnBiased;
    f32x4_t         vecTmpFlt0, vecTmpFlt1;
    f32x4_t         vecAcc0, vecAcc1, vecAcc2, vecAcc3;
    f32x4_t         vecExpUnBiasedFlt;

    /*
     * extract exponent
     */
    vecTmpFlt1 = vmant_exp_f32(vecIn, &vecExpUnBiased);

    /* xx = mantissa squared */
    vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1;
    /*
     * a = (__logf_lut_f32[4] * r.f) + (__logf_lut_f32[0]);
     */
    vecAcc0 = vdupq_n_f32(__logf_lut_f32[0]);
    vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f32[4]);
    /*
     * b = (__logf_lut_f32[6] * r.f) + (__logf_lut_f32[2]);
     */
    vecAcc1 = vdupq_n_f32(__logf_lut_f32[2]);
    vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f32[6]);
    /*
     * c = (__logf_lut_f32[5] * r.f) + (__logf_lut_f32[1]);
     */
    vecAcc2 = vdupq_n_f32(__logf_lut_f32[1]);
    vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f32[5]);
    /*
     * d = (__logf_lut_f32[7] * r.f) + (__logf_lut_f32[3]);
     */
    vecAcc3 = vdupq_n_f32(__logf_lut_f32[3]);
    vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f32[7]);
    /*
     * a = a + b * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0);
    /*
     * c = c + d * xx;
     */
    vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0);
    /*
     * xx = xx * xx;
     */
    vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0;
    vecExpUnBiasedFlt = vcvtq_f32_s32(vecExpUnBiased);
    /*
     * r.f = a + c * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0);
    /*
     * add exponent
     * r.f = r.f + ((float32_t) m) * __logf_rng_f32;
     */
    vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f32);
    // set log0 down to -inf
    vecAcc0 = vdupq_m(vecAcc0, -INFINITY, vcmpeqq(vecIn, 0.0f));
    return vecAcc0;
}
219
/* Vector exp(x).
   Range reduction: m = round-toward-zero of x/ln(2) (1.4426950408 = 1/ln(2)),
   val = x - m*ln(2); then a polynomial approximation of exp(val) is scaled
   by 2^m through a saturating add into the exponent field.
   NOTE(review): only underflow (m < -126) is explicitly flushed to 0;
   behavior for large positive x rests on the saturating add — confirm the
   required accuracy at the range extremes against the caller's needs. */
__STATIC_INLINE f32x4_t vexpq_f32(
    f32x4_t x)
{
    // Perform range reduction [-log(2),log(2)]
    int32x4_t       m = vcvtq_s32_f32(vmulq_n_f32(x, 1.4426950408f));
    f32x4_t         val = vfmsq_f32(x, vcvtq_f32_s32(m), vdupq_n_f32(0.6931471805f));

    // Polynomial Approximation
    f32x4_t         poly = vtaylor_polyq_f32(val, exp_tab);

    // Reconstruct: multiply by 2^m via the exponent field (saturating)
    poly = (f32x4_t) (vqaddq_s32((q31x4_t) (poly), vqshlq_n_s32(m, 23)));

    // Flush underflowed lanes to zero
    poly = vdupq_m(poly, 0.0f, vcmpltq_n_s32(m, -126));
    return poly;
}
236
arm_vec_exponent_f32(f32x4_t x,int32_t nb)237 __STATIC_INLINE f32x4_t arm_vec_exponent_f32(f32x4_t x, int32_t nb)
238 {
239 f32x4_t r = x;
240 nb--;
241 while (nb > 0) {
242 r = vmulq(r, x);
243 nb--;
244 }
245 return (r);
246 }
247
/* Fast vector reciprocal.
   Initial estimate via the bit-pattern trick v.i = INV_NEWTON_INIT_F32 - x.i,
   then one pass of a degree-7 correction polynomial in w = |x| * estimate
   (Horner form of the expanded multi-step Newton correction).
   1/0 is patched to +inf and the input sign is restored at the end. */
__STATIC_INLINE f32x4_t vrecip_f32(f32x4_t vecIn)
{
    f32x4_t     vecSx, vecW, vecTmp;
    any32x4_t   v;

    vecSx = vabsq(vecIn);

    v.f = vecIn;
    v.i = vsubq(vdupq_n_s32(INV_NEWTON_INIT_F32), v.i);

    vecW = vmulq(vecSx, v.f);

    // v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w *(56 + w * (-28 + w * (8 - w)))))));
    vecTmp = vsubq(vdupq_n_f32(8.0f), vecW);
    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
    vecTmp = vfmasq(vecW, vecTmp, -70.0f);
    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
    vecTmp = vfmasq(vecW, vecTmp, 8.0f);
    v.f = vmulq(v.f, vecTmp);

    /* 1/0 -> +inf */
    v.f = vdupq_m(v.f, INFINITY, vcmpeqq(vecIn, 0.0f));
    /*
     * restore sign
     */
    v.f = vnegq_m(v.f, v.f, vcmpltq(vecIn, 0.0f));
    return v.f;
}
277
vtanhq_f32(f32x4_t val)278 __STATIC_INLINE f32x4_t vtanhq_f32(
279 f32x4_t val)
280 {
281 f32x4_t x =
282 vminnmq_f32(vmaxnmq_f32(val, vdupq_n_f32(-10.f)), vdupq_n_f32(10.0f));
283 f32x4_t exp2x = vexpq_f32(vmulq_n_f32(x, 2.f));
284 f32x4_t num = vsubq_n_f32(exp2x, 1.f);
285 f32x4_t den = vaddq_n_f32(exp2x, 1.f);
286 f32x4_t tanh = vmulq_f32(num, vrecip_f32(den));
287 return tanh;
288 }
289
vpowq_f32(f32x4_t val,f32x4_t n)290 __STATIC_INLINE f32x4_t vpowq_f32(
291 f32x4_t val,
292 f32x4_t n)
293 {
294 return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
295 }
296
297 #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
298
299 #if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
300 #endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */
301
302 #if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
303
304 #include "NEMath.h"
305 /**
306 * @brief Vectorized integer exponentiation
307 * @param[in] x value
308 * @param[in] nb integer exponent >= 1
309 * @return x^nb
310 *
311 */
arm_vec_exponent_f32(float32x4_t x,int32_t nb)312 __STATIC_INLINE float32x4_t arm_vec_exponent_f32(float32x4_t x, int32_t nb)
313 {
314 float32x4_t r = x;
315 nb --;
316 while(nb > 0)
317 {
318 r = vmulq_f32(r , x);
319 nb--;
320 }
321 return(r);
322 }
323
324
/* Vector square root: sqrt(x) = x * (1/sqrt(x)), where 1/sqrt(x) comes from
   the NEON reciprocal-square-root estimate refined by two vrsqrtsq_f32
   Newton steps. The estimate input is floored at FLT_MIN so that x == 0
   does not feed 0 into vrsqrteq; the final multiply by the original x still
   yields 0 for a zero input. */
__STATIC_INLINE float32x4_t __arm_vec_sqrt_f32_neon(float32x4_t x)
{
    float32x4_t x1 = vmaxq_f32(x, vdupq_n_f32(FLT_MIN));
    float32x4_t e = vrsqrteq_f32(x1);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    return vmulq_f32(x, e);
}
333
__arm_vec_sqrt_q15_neon(int16x8_t vec)334 __STATIC_INLINE int16x8_t __arm_vec_sqrt_q15_neon(int16x8_t vec)
335 {
336 float32x4_t tempF;
337 int32x4_t tempHI,tempLO;
338
339 tempLO = vmovl_s16(vget_low_s16(vec));
340 tempF = vcvtq_n_f32_s32(tempLO,15);
341 tempF = __arm_vec_sqrt_f32_neon(tempF);
342 tempLO = vcvtq_n_s32_f32(tempF,15);
343
344 tempHI = vmovl_s16(vget_high_s16(vec));
345 tempF = vcvtq_n_f32_s32(tempHI,15);
346 tempF = __arm_vec_sqrt_f32_neon(tempF);
347 tempHI = vcvtq_n_s32_f32(tempF,15);
348
349 return(vcombine_s16(vqmovn_s32(tempLO),vqmovn_s32(tempHI)));
350 }
351
__arm_vec_sqrt_q31_neon(int32x4_t vec)352 __STATIC_INLINE int32x4_t __arm_vec_sqrt_q31_neon(int32x4_t vec)
353 {
354 float32x4_t temp;
355
356 temp = vcvtq_n_f32_s32(vec,31);
357 temp = __arm_vec_sqrt_f32_neon(temp);
358 return(vcvtq_n_s32_f32(temp,31));
359 }
360
361 #endif /* (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) */
362
363 #ifdef __cplusplus
364 }
365 #endif
366
367
368 #endif /* _ARM_VEC_MATH_H */
369
370 /**
371 *
372 * End of file.
373 */
374