/******************************************************************************
 * @file     arm_vec_math_f16.h
 * @brief    Public header file for CMSIS DSP Library
 * @version  V1.9.0
 * @date     23 April 2021
 * Target Processor: Cortex-M and Cortex-A cores
 ******************************************************************************/
/*
 * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _ARM_VEC_MATH_F16_H
#define _ARM_VEC_MATH_F16_H

#include "arm_math_types_f16.h"
#include "arm_common_tables_f16.h"
#include "arm_helium_utils.h"

#ifdef   __cplusplus
extern "C"
{
#endif

#if defined(ARM_FLOAT16_SUPPORTED)


#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)


/* ln(2), used to fold the unbiased exponent back into the result of vlogq_f16 */
static const float16_t __logf_rng_f16 = 0.693147180f16;

/* fast inverse approximation (3x newton) */
__STATIC_INLINE f16x8_t vrecip_medprec_f16(
    f16x8_t x)
{
    q15x8_t         m;
    f16x8_t         b;
    any16x8_t       xinv;
    f16x8_t         ax = vabsq(x);

    xinv.f = ax;

    /* seed: 0x7c00 masks the f16 exponent field and 0x3c00 is the bit pattern
       of 1.0f16, so adding m scales ax into [1,2); the linear fit below then
       approximates 1/ax there and adding m again restores the reciprocal's
       exponent */
    m = 0x03c00 - (xinv.i & 0x07c00);
    xinv.i = xinv.i + m;
    xinv.f = 1.41176471f16 - 0.47058824f16 * xinv.f;
    xinv.i = xinv.i + m;

    /* three Newton-Raphson iterations: x(n+1) = x(n) * (2 - ax * x(n)) */
    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    /* 1/0 -> +infinity */
    xinv.f = vdupq_m(xinv.f, F16INFINITY, vcmpeqq(x, 0.0f));
    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}

/* fast inverse approximation (4x newton) */
__STATIC_INLINE f16x8_t vrecip_hiprec_f16(
    f16x8_t x)
{
    q15x8_t         m;
    f16x8_t         b;
    any16x8_t       xinv;
    f16x8_t         ax = vabsq(x);

    xinv.f = ax;

    /* same exponent-flip seed as vrecip_medprec_f16 */
    m = 0x03c00 - (xinv.i & 0x07c00);
    xinv.i = xinv.i + m;
    xinv.f = 1.41176471f16 - 0.47058824f16 * xinv.f;
    xinv.i = xinv.i + m;

    /* four Newton-Raphson iterations: x(n+1) = x(n) * (2 - ax * x(n)) */
    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    /* 1/0 -> +infinity */
    xinv.f = vdupq_m(xinv.f, F16INFINITY, vcmpeqq(x, 0.0f));
    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}

__STATIC_INLINE f16x8_t vdiv_f16(
    f16x8_t num, f16x8_t den)
{
    return vmulq(num, vrecip_hiprec_f16(den));
}
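
/*
 * Usage sketch (illustrative only, not part of the library API): elementwise
 * division of two buffers with vdiv_f16. The helper name, its parameters and
 * the assumption that blockSize is a multiple of 8 are hypothetical.
 */
__STATIC_INLINE void vdiv_buffer_sketch_f16(
    const float16_t * pSrcA,
    const float16_t * pSrcB,
          float16_t * pDst,
          uint32_t    blockSize)
{
    for (uint32_t i = 0; i < blockSize; i += 8)
    {
        f16x8_t num = vld1q_f16(&pSrcA[i]);   /* load 8 half-precision lanes */
        f16x8_t den = vld1q_f16(&pSrcB[i]);
        vst1q_f16(&pDst[i], vdiv_f16(num, den));
    }
}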


/**
  @brief         Half-precision Taylor polynomial evaluation.
  @param[in]     x              f16 vector input
  @param[in]     coeffs         f16 polynomial coefficients
  @return        f16 vector result

  Evaluates coeffs[0] + coeffs[4]*x  + coeffs[2]*x^2 + coeffs[6]*x^3
          + coeffs[1]*x^4 + coeffs[5]*x^5 + coeffs[3]*x^6 + coeffs[7]*x^7
  using a split evaluation scheme to shorten the dependency chain.
 */

__STATIC_INLINE float16x8_t vtaylor_polyq_f16(
        float16x8_t           x,
        const float16_t * coeffs)
{
    float16x8_t         A = vfmasq(vdupq_n_f16(coeffs[4]), x, coeffs[0]);
    float16x8_t         B = vfmasq(vdupq_n_f16(coeffs[6]), x, coeffs[2]);
    float16x8_t         C = vfmasq(vdupq_n_f16(coeffs[5]), x, coeffs[1]);
    float16x8_t         D = vfmasq(vdupq_n_f16(coeffs[7]), x, coeffs[3]);
    float16x8_t         x2 = vmulq(x, x);
    float16x8_t         x4 = vmulq(x2, x2);
    float16x8_t         res = vfmaq(vfmaq_f16(A, B, x2), vfmaq_f16(C, D, x2), x4);

    return res;
}

/* Split x into its unbiased exponent (vecExpUnBiased) and its mantissa scaled
   into [1,2) (vecTmpFlt1); both output variables must be declared by the caller. */
#define VMANT_EXP_F16(x)  \
    any16x8_t       r;    \
    int16x8_t       n;    \
                          \
    r.f = x;              \
    n = r.i >> 10;        \
    n = n - 15;           \
    r.i = r.i - (n << 10);\
                          \
    vecExpUnBiased = n;   \
    vecTmpFlt1 = r.f;

__STATIC_INLINE float16x8_t vlogq_f16(float16x8_t vecIn)
{
    q15x8_t             vecExpUnBiased;
    float16x8_t         vecTmpFlt0, vecTmpFlt1;
    float16x8_t         vecAcc0, vecAcc1, vecAcc2, vecAcc3;
    float16x8_t         vecExpUnBiasedFlt;

    /*
     * extract exponent
     */
    VMANT_EXP_F16(vecIn);

    vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1;
    /*
     * a = (__logf_lut_f16[4] * r.f) + (__logf_lut_f16[0]);
     */
    vecAcc0 = vdupq_n_f16(__logf_lut_f16[0]);
    vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f16[4]);
    /*
     * b = (__logf_lut_f16[6] * r.f) + (__logf_lut_f16[2]);
     */
    vecAcc1 = vdupq_n_f16(__logf_lut_f16[2]);
    vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f16[6]);
    /*
     * c = (__logf_lut_f16[5] * r.f) + (__logf_lut_f16[1]);
     */
    vecAcc2 = vdupq_n_f16(__logf_lut_f16[1]);
    vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f16[5]);
    /*
     * d = (__logf_lut_f16[7] * r.f) + (__logf_lut_f16[3]);
     */
    vecAcc3 = vdupq_n_f16(__logf_lut_f16[3]);
    vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f16[7]);
    /*
     * a = a + b * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0);
    /*
     * c = c + d * xx;
     */
    vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0);
    /*
     * xx = xx * xx;
     */
    vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0;
    vecExpUnBiasedFlt = vcvtq_f16_s16(vecExpUnBiased);
    /*
     * r.f = a + c * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0);
    /*
     * add exponent
     * r.f = r.f + ((float16_t) m) * __logf_rng_f16;
     */
    vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f16);
    // force log(0) down to -inf
    vecAcc0 = vdupq_m(vecAcc0, -F16INFINITY, vcmpeqq(vecIn, 0.0f));
    return vecAcc0;
}

__STATIC_INLINE float16x8_t vexpq_f16(
    float16x8_t x)
{
    // Perform range reduction [-log(2),log(2)]: m = (int)(x / ln(2)), val = x - m*ln(2)
    int16x8_t       m = vcvtq_s16_f16(vmulq_n_f16(x, 1.4426950408f16));
    float16x8_t     val = vfmsq_f16(x, vcvtq_f16_s16(m), vdupq_n_f16(0.6931471805f16));

    // Polynomial Approximation
    float16x8_t         poly = vtaylor_polyq_f16(val, exp_tab_f16);

    // Reconstruct: multiply by 2^m by adding m to the exponent field
    // (saturating ops guard against exponent overflow)
    poly = (float16x8_t) (vqaddq_s16((int16x8_t) (poly), vqshlq_n_s16(m, 10)));

    // Flush results below the normal f16 range (m < -14) to zero
    poly = vdupq_m(poly, 0.0f, vcmpltq_n_s16(m, -14));
    return poly;
}
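
/*
 * Illustration only (not part of the library): a scalar sketch of the
 * reconstruction trick used above. Adding m to the biased exponent field
 * (bits [14:10]) of an IEEE-754 half-precision value multiplies it by 2^m.
 * The helper name is hypothetical and it assumes the result stays within
 * the normal f16 range; vexpq_f16 uses saturating vector ops instead.
 */
__STATIC_INLINE float16_t scale_by_pow2_sketch_f16(float16_t v, int16_t m)
{
    union { float16_t f; int16_t i; } u;  /* reinterpret the f16 bit pattern */
    u.f = v;
    u.i = (int16_t)(u.i + (m << 10));     /* bump the exponent field by m */
    return u.f;
}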

/* x raised to the integer power nb by repeated multiplication (returns x when nb <= 1) */
__STATIC_INLINE float16x8_t arm_vec_exponent_f16(float16x8_t x, int16_t nb)
{
    float16x8_t         r = x;
    nb--;
    while (nb > 0) {
        r = vmulq(r, x);
        nb--;
    }
    return (r);
}

__STATIC_INLINE f16x8_t vpowq_f16(
    f16x8_t val,
    f16x8_t n)
{
    return vexpq_f16(vmulq_f16(n, vlogq_f16(val)));
}
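
/*
 * Usage sketch (illustrative only, not part of the library): vpowq_f16
 * computes exp(n * log(val)), so it is only meaningful for strictly
 * positive val. Example: an elementwise cube root. The helper name is
 * hypothetical.
 */
__STATIC_INLINE f16x8_t vcbrt_sketch_f16(f16x8_t x)
{
    /* x^(1/3) = exp(log(x) / 3), valid for x > 0 */
    return vpowq_f16(x, vdupq_n_f16((float16_t)(1.0f / 3.0f)));
}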

#define INV_NEWTON_INIT_F16  0x7773

__STATIC_INLINE f16x8_t vrecip_f16(f16x8_t vecIn)
{
    f16x8_t     vecSx, vecW, vecTmp;
    any16x8_t   v;

    vecSx = vabsq(vecIn);

    v.f = vecIn;

    /* seed: subtracting the bit pattern from a magic constant gives a rough
       reciprocal estimate, refined below by a degree-7 polynomial correction */
    v.i = vsubq(vdupq_n_s16(INV_NEWTON_INIT_F16), v.i);

    vecW = vmulq(vecSx, v.f);

    // v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w *(56 + w * (-28 + w * (8 - w)))))));
    vecTmp = vsubq(vdupq_n_f16(8.0f), vecW);
    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
    vecTmp = vfmasq(vecW, vecTmp, -70.0f);
    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
    vecTmp = vfmasq(vecW, vecTmp, 8.0f);
    v.f = vmulq(v.f,  vecTmp);

    /* 1/0 -> +infinity */
    v.f = vdupq_m(v.f, F16INFINITY, vcmpeqq(vecIn, 0.0f));
    /*
     * restore sign
     */
    v.f = vnegq_m(v.f, v.f, vcmpltq(vecIn, 0.0f));
    return v.f;
}

__STATIC_INLINE f16x8_t vtanhq_f16(
    f16x8_t val)
{
    /* tanh(x) = (exp(2x) - 1) / (exp(2x) + 1), with x clamped to [-10, 10] */
    f16x8_t         x =
        vminnmq_f16(vmaxnmq_f16(val, vdupq_n_f16(-10.f)), vdupq_n_f16(10.0f));
    f16x8_t         exp2x = vexpq_f16(vmulq_n_f16(x, 2.f));
    f16x8_t         num = vsubq_n_f16(exp2x, 1.f);
    f16x8_t         den = vaddq_n_f16(exp2x, 1.f);
    f16x8_t         tanh = vmulq_f16(num, vrecip_f16(den));
    return tanh;
}
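
/*
 * Usage sketch (illustrative only, not part of the library): apply
 * vtanhq_f16 over a buffer of arbitrary length, using tail predication
 * for the final partial vector. The helper name and parameters are
 * hypothetical.
 */
__STATIC_INLINE void vtanh_buffer_sketch_f16(
    const float16_t * pSrc,
          float16_t * pDst,
          uint32_t    blockSize)
{
    int32_t cnt = (int32_t) blockSize;
    while (cnt > 0)
    {
        mve_pred16_t p  = vctp16q((uint32_t) cnt);   /* enable remaining lanes only */
        f16x8_t      in = vldrhq_z_f16(pSrc, p);     /* predicated load, inactive lanes zeroed */
        vstrhq_p_f16(pDst, vtanhq_f16(in), p);       /* predicated store */
        pSrc += 8;
        pDst += 8;
        cnt  -= 8;
    }
}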

#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */



#ifdef   __cplusplus
}
#endif

#endif /* ARM_FLOAT16_SUPPORTED */

#endif /* _ARM_VEC_MATH_F16_H */

/**
 *
 * End of file.
 */