1 /******************************************************************************
2  * @file     arm_vec_math.h
3  * @brief    Public header file for CMSIS DSP Library
4  * @version  V1.9.0
5  * @date     23 April 2021
6  * Target Processor: Cortex-M and Cortex-A cores
7  ******************************************************************************/
8 /*
9  * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved.
10  *
11  * SPDX-License-Identifier: Apache-2.0
12  *
13  * Licensed under the Apache License, Version 2.0 (the License); you may
14  * not use this file except in compliance with the License.
15  * You may obtain a copy of the License at
16  *
17  * www.apache.org/licenses/LICENSE-2.0
18  *
19  * Unless required by applicable law or agreed to in writing, software
20  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
21  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22  * See the License for the specific language governing permissions and
23  * limitations under the License.
24  */
25 
26 #ifndef _ARM_VEC_MATH_H
27 #define _ARM_VEC_MATH_H
28 
29 #include "arm_math_types.h"
30 #include "arm_common_tables.h"
31 #include "arm_helium_utils.h"
32 
33 #ifdef   __cplusplus
34 extern "C"
35 {
36 #endif
37 
38 #if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
39 
/* Bit-pattern seed for vrecip_f32(): subtracting a float's bits from this
   constant yields a first 1/x estimate, refined by the polynomial below
   (same family of tricks as the classic "fast inverse" hack). */
#define INV_NEWTON_INIT_F32         0x7EF127EA

/* ln(2); vlogq_f32() multiplies the unbiased exponent by this to fold it
   back into the natural-log result. */
static const float32_t __logf_rng_f32=0.693147180f;
43 
44 
/* fast inverse approximation (3x newton) */
/**
  @brief         Per-lane reciprocal 1/x, medium precision (3 Newton steps).
  @param[in]     x    f32 quad vector input
  @return        approximation of 1/x per lane; lanes where x == 0 return
                 +INFINITY, and the sign of x is preserved in the result
 */
__STATIC_INLINE f32x4_t vrecip_medprec_f32(
    f32x4_t x)
{
    q31x4_t         m;
    f32x4_t         b;
    any32x4_t       xinv;
    f32x4_t         ax = vabsq(x);  /* work on |x|; sign restored at the end */

    xinv.f = ax;
    /* Exponent trick: m re-biases the exponent so the value is scaled into
       a fixed range; the linear fit below then seeds the Newton iteration.
       Adding m back afterwards undoes the scaling. */
    m = 0x3F800000 - (xinv.i & 0x7F800000);
    xinv.i = xinv.i + m;
    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
    xinv.i = xinv.i + m;

    /* Newton-Raphson for 1/ax: y' = y * (2 - ax*y); each step roughly
       doubles the number of correct bits. */
    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    /* 1/0 -> +inf in the lanes where the input is exactly zero */
    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}
77 
/* fast inverse approximation (4x newton) */
/**
  @brief         Per-lane reciprocal 1/x, high precision (4 Newton steps).
  @param[in]     x    f32 quad vector input
  @return        approximation of 1/x per lane; lanes where x == 0 return
                 +INFINITY, and the sign of x is preserved in the result

  Identical to vrecip_medprec_f32() except for one extra Newton-Raphson
  refinement step.
 */
__STATIC_INLINE f32x4_t vrecip_hiprec_f32(
    f32x4_t x)
{
    q31x4_t         m;
    f32x4_t         b;
    any32x4_t       xinv;
    f32x4_t         ax = vabsq(x);  /* work on |x|; sign restored at the end */

    xinv.f = ax;

    /* exponent re-bias + linear fit to seed the iteration (see medprec) */
    m = 0x3F800000 - (xinv.i & 0x7F800000);
    xinv.i = xinv.i + m;
    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
    xinv.i = xinv.i + m;

    /* Newton-Raphson for 1/ax: y' = y * (2 - ax*y), four refinements */
    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    /* 1/0 -> +inf in the lanes where the input is exactly zero */
    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}
114 
vdiv_f32(f32x4_t num,f32x4_t den)115 __STATIC_INLINE f32x4_t vdiv_f32(
116     f32x4_t num, f32x4_t den)
117 {
118     return vmulq(num, vrecip_hiprec_f32(den));
119 }
120 
121 /**
122   @brief         Single-precision taylor dev.
123   @param[in]     x              f32 quad vector input
124   @param[in]     coeffs         f32 quad vector coeffs
125   @return        destination    f32 quad vector
126  */
127 
vtaylor_polyq_f32(f32x4_t x,const float32_t * coeffs)128 __STATIC_INLINE f32x4_t vtaylor_polyq_f32(
129         f32x4_t           x,
130         const float32_t * coeffs)
131 {
132     f32x4_t         A = vfmasq(vdupq_n_f32(coeffs[4]), x, coeffs[0]);
133     f32x4_t         B = vfmasq(vdupq_n_f32(coeffs[6]), x, coeffs[2]);
134     f32x4_t         C = vfmasq(vdupq_n_f32(coeffs[5]), x, coeffs[1]);
135     f32x4_t         D = vfmasq(vdupq_n_f32(coeffs[7]), x, coeffs[3]);
136     f32x4_t         x2 = vmulq(x, x);
137     f32x4_t         x4 = vmulq(x2, x2);
138     f32x4_t         res = vfmaq(vfmaq_f32(A, B, x2), vfmaq_f32(C, D, x2), x4);
139 
140     return res;
141 }
142 
/**
  @brief         Split each lane into mantissa and exponent: x = m * 2^e.
  @param[in]     x    f32 quad vector input
  @param[out]    e    unbiased exponent per lane
  @return        mantissa per lane (exponent forced back to bias 127)

  NOTE(review): the raw `r.i >> 23` includes the sign bit, so negative or
  zero lanes produce meaningless results — appears to assume positive
  normalized inputs (its only visible caller is vlogq_f32); confirm.
 */
__STATIC_INLINE f32x4_t vmant_exp_f32(
    f32x4_t     x,
    int32x4_t * e)
{
    any32x4_t       r;
    int32x4_t       n;

    r.f = x;
    /* extract biased exponent field, then remove the IEEE-754 bias */
    n = r.i >> 23;
    n = n - 127;
    /* subtract the exponent back out of the bit pattern, leaving the
       mantissa scaled into [1, 2) */
    r.i = r.i - (n << 23);

    *e = n;
    return r.f;
}
158 
159 
/**
  @brief         Per-lane natural logarithm.
  @param[in]     vecIn    f32 quad vector input (expects positive values;
                          lanes equal to 0 return -INFINITY)
  @return        ln(vecIn) per lane

  Splits each lane into mantissa * 2^exponent, evaluates a degree-7
  polynomial (coefficients in __logf_lut_f32, defined elsewhere) on the
  mantissa using an Estrin-style scheme, then adds exponent * ln(2).
 */
__STATIC_INLINE f32x4_t vlogq_f32(f32x4_t vecIn)
{
    q31x4_t         vecExpUnBiased;
    f32x4_t         vecTmpFlt0, vecTmpFlt1;
    f32x4_t         vecAcc0, vecAcc1, vecAcc2, vecAcc3;
    f32x4_t         vecExpUnBiasedFlt;

    /*
     * extract exponent
     */
    vecTmpFlt1 = vmant_exp_f32(vecIn, &vecExpUnBiased);

    vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1;
    /*
     * a = (__logf_lut_f32[4] * r.f) + (__logf_lut_f32[0]);
     */
    vecAcc0 = vdupq_n_f32(__logf_lut_f32[0]);
    vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f32[4]);
    /*
     * b = (__logf_lut_f32[6] * r.f) + (__logf_lut_f32[2]);
     */
    vecAcc1 = vdupq_n_f32(__logf_lut_f32[2]);
    vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f32[6]);
    /*
     * c = (__logf_lut_f32[5] * r.f) + (__logf_lut_f32[1]);
     */
    vecAcc2 = vdupq_n_f32(__logf_lut_f32[1]);
    vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f32[5]);
    /*
     * d = (__logf_lut_f32[7] * r.f) + (__logf_lut_f32[3]);
     */
    vecAcc3 = vdupq_n_f32(__logf_lut_f32[3]);
    vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f32[7]);
    /*
     * a = a + b * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0);
    /*
     * c = c + d * xx;
     */
    vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0);
    /*
     * xx = xx * xx;
     */
    vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0;
    vecExpUnBiasedFlt = vcvtq_f32_s32(vecExpUnBiased);
    /*
     * r.f = a + c * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0);
    /*
     * add exponent
     * r.f = r.f + ((float32_t) m) * __logf_rng_f32;
     */
    vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f32);
    // set log0 down to -inf
    vecAcc0 = vdupq_m(vecAcc0, -INFINITY, vcmpeqq(vecIn, 0.0f));
    return vecAcc0;
}
219 
/**
  @brief         Per-lane exponential e^x.
  @param[in]     x    f32 quad vector input
  @return        exp(x) per lane; results underflowing below 2^-126 are
                 flushed to 0

  Range-reduces via x = m*ln(2) + val with val in [-ln(2), ln(2)], runs a
  polynomial approximation of e^val, then rescales by 2^m through the
  exponent bits. The 1.4426950408f constant is 1/ln(2); 0.6931471805f is
  ln(2). NOTE(review): large positive overflow relies only on the
  saturating vqaddq_s32/vqshlq_n_s32 reconstruction — confirm behaviour
  for |x| well above the f32 exponent range.
 */
__STATIC_INLINE f32x4_t vexpq_f32(
    f32x4_t x)
{
    // Perform range reduction [-log(2),log(2)]
    int32x4_t       m = vcvtq_s32_f32(vmulq_n_f32(x, 1.4426950408f));
    f32x4_t         val = vfmsq_f32(x, vcvtq_f32_s32(m), vdupq_n_f32(0.6931471805f));

    // Polynomial Approximation
    f32x4_t         poly = vtaylor_polyq_f32(val, exp_tab);

    // Reconstruct: multiply by 2^m by adding m to the exponent field
    poly = (f32x4_t) (vqaddq_s32((q31x4_t) (poly), vqshlq_n_s32(m, 23)));

    // flush lanes that underflow the normalized f32 range to zero
    poly = vdupq_m(poly, 0.0f, vcmpltq_n_s32(m, -126));
    return poly;
}
236 
arm_vec_exponent_f32(f32x4_t x,int32_t nb)237 __STATIC_INLINE f32x4_t arm_vec_exponent_f32(f32x4_t x, int32_t nb)
238 {
239     f32x4_t         r = x;
240     nb--;
241     while (nb > 0) {
242         r = vmulq(r, x);
243         nb--;
244     }
245     return (r);
246 }
247 
/**
  @brief         Per-lane reciprocal 1/x (magic-constant seed + polynomial).
  @param[in]     vecIn    f32 quad vector input
  @return        approximation of 1/vecIn per lane; zero lanes return
                 +INFINITY, and the sign of the input is preserved
 */
__STATIC_INLINE f32x4_t vrecip_f32(f32x4_t vecIn)
{
    f32x4_t     vecSx, vecW, vecTmp;
    any32x4_t   v;

    vecSx = vabsq(vecIn);

    /* seed: subtract the input bits from the magic constant to get a
       rough 1/x estimate (note the estimate uses vecIn's bits while the
       refinement below works against |vecIn|) */
    v.f = vecIn;
    v.i = vsubq(vdupq_n_s32(INV_NEWTON_INIT_F32), v.i);

    /* w = |x| * estimate, close to 1 for a good seed */
    vecW = vmulq(vecSx, v.f);

    // v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w *(56 + w * (-28 + w * (8 - w)))))));
    /* Horner evaluation via vfmasq(v, w, s) = v*w + s; the +/- binomial
       coefficients match sum_{k=0..7} (1-w)^k, the truncated series for
       1/w, giving a high-order refinement in one pass */
    vecTmp = vsubq(vdupq_n_f32(8.0f), vecW);
    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
    vecTmp = vfmasq(vecW, vecTmp, -70.0f);
    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
    vecTmp = vfmasq(vecW, vecTmp, 8.0f);
    v.f = vmulq(v.f,  vecTmp);

    /* 1/0 -> +inf in the zero lanes */
    v.f = vdupq_m(v.f, INFINITY, vcmpeqq(vecIn, 0.0f));
    /*
     * restore sign
     */
    v.f = vnegq_m(v.f, v.f, vcmpltq(vecIn, 0.0f));
    return v.f;
}
277 
vtanhq_f32(f32x4_t val)278 __STATIC_INLINE f32x4_t vtanhq_f32(
279     f32x4_t val)
280 {
281     f32x4_t         x =
282         vminnmq_f32(vmaxnmq_f32(val, vdupq_n_f32(-10.f)), vdupq_n_f32(10.0f));
283     f32x4_t         exp2x = vexpq_f32(vmulq_n_f32(x, 2.f));
284     f32x4_t         num = vsubq_n_f32(exp2x, 1.f);
285     f32x4_t         den = vaddq_n_f32(exp2x, 1.f);
286     f32x4_t         tanh = vmulq_f32(num, vrecip_f32(den));
287     return tanh;
288 }
289 
vpowq_f32(f32x4_t val,f32x4_t n)290 __STATIC_INLINE f32x4_t vpowq_f32(
291     f32x4_t val,
292     f32x4_t n)
293 {
294     return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
295 }
296 
297 #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
298 
299 #if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
300 #endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */
301 
302 #if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
303 
304 #include "NEMath.h"
305 /**
306  * @brief Vectorized integer exponentiation
307  * @param[in]    x           value
308  * @param[in]    nb          integer exponent >= 1
309  * @return x^nb
310  *
311  */
arm_vec_exponent_f32(float32x4_t x,int32_t nb)312 __STATIC_INLINE  float32x4_t arm_vec_exponent_f32(float32x4_t x, int32_t nb)
313 {
314     float32x4_t r = x;
315     nb --;
316     while(nb > 0)
317     {
318         r = vmulq_f32(r , x);
319         nb--;
320     }
321     return(r);
322 }
323 
324 
/**
  @brief         Per-lane square root using the NEON rsqrt estimate.
  @param[in]     x    f32 quad vector input
  @return        sqrt(x) per lane, computed as x * (1/sqrt(x))
 */
__STATIC_INLINE float32x4_t __arm_vec_sqrt_f32_neon(float32x4_t  x)
{
    /* clamp to FLT_MIN so vrsqrteq_f32 never sees 0 (which would give inf) */
    float32x4_t x1 = vmaxq_f32(x, vdupq_n_f32(FLT_MIN));
    /* rsqrt estimate plus two Newton-Raphson steps:
       vrsqrtsq_f32(a, b) = (3 - a*b) / 2 */
    float32x4_t e = vrsqrteq_f32(x1);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    /* note: final multiply uses the original x, so sqrt(0) -> 0 */
    return vmulq_f32(x, e);
}
333 
__arm_vec_sqrt_q15_neon(int16x8_t vec)334 __STATIC_INLINE int16x8_t __arm_vec_sqrt_q15_neon(int16x8_t vec)
335 {
336     float32x4_t tempF;
337     int32x4_t tempHI,tempLO;
338 
339     tempLO = vmovl_s16(vget_low_s16(vec));
340     tempF = vcvtq_n_f32_s32(tempLO,15);
341     tempF = __arm_vec_sqrt_f32_neon(tempF);
342     tempLO = vcvtq_n_s32_f32(tempF,15);
343 
344     tempHI = vmovl_s16(vget_high_s16(vec));
345     tempF = vcvtq_n_f32_s32(tempHI,15);
346     tempF = __arm_vec_sqrt_f32_neon(tempF);
347     tempHI = vcvtq_n_s32_f32(tempF,15);
348 
349     return(vcombine_s16(vqmovn_s32(tempLO),vqmovn_s32(tempHI)));
350 }
351 
__arm_vec_sqrt_q31_neon(int32x4_t vec)352 __STATIC_INLINE int32x4_t __arm_vec_sqrt_q31_neon(int32x4_t vec)
353 {
354   float32x4_t temp;
355 
356   temp = vcvtq_n_f32_s32(vec,31);
357   temp = __arm_vec_sqrt_f32_neon(temp);
358   return(vcvtq_n_s32_f32(temp,31));
359 }
360 
361 #endif /*  (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) */
362 
363 #ifdef   __cplusplus
364 }
365 #endif
366 
367 
368 #endif /* _ARM_VEC_MATH_H */
369 
370 /**
371  *
372  * End of file.
373  */
374