1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_mag_f16.c
4  * Description:  Floating-point complex magnitude
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 /**
33   @ingroup groupCmplxMath
34  */
35 
36 
37 
38 /**
39   @addtogroup cmplx_mag
40   @{
41  */
42 
43 /**
44   @brief         Floating-point complex magnitude.
45   @param[in]     pSrc        points to input vector
46   @param[out]    pDst        points to output vector
47   @param[in]     numSamples  number of samples in each vector
48  */
49 
50 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
51 
52 #include "arm_helium_utils.h"
53 
54 
arm_cmplx_mag_f16(const float16_t * pSrc,float16_t * pDst,uint32_t numSamples)55 void arm_cmplx_mag_f16(
56   const float16_t * pSrc,
57         float16_t * pDst,
58         uint32_t numSamples)
59 {
60     int32_t blockSize = numSamples;  /* loop counters */
61     uint32_t  blkCnt;           /* loop counters */
62     f16x8x2_t vecSrc;
63     f16x8_t sum;
64 
65     /* Compute 4 complex samples at a time */
66     blkCnt = blockSize >> 3;
67     while (blkCnt > 0U)
68     {
69         q15x8_t newtonStartVec;
70         f16x8_t sumHalf, invSqrt;
71 
72         vecSrc = vld2q(pSrc);
73         pSrc += 16;
74         sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
75         sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
76 
77         /*
78          * inlined Fast SQRT using inverse SQRT newton-raphson method
79          */
80 
81         /* compute initial value */
82         newtonStartVec = vdupq_n_s16(INVSQRT_MAGIC_F16) - vshrq((q15x8_t) sum, 1);
83         sumHalf = sum * 0.5f;
84         /*
85          * compute 3 x iterations
86          *
87          * The more iterations, the more accuracy.
88          * If you need to trade a bit of accuracy for more performance,
89          * you can comment out the 3rd use of the macro.
90          */
91         INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, (f16x8_t) newtonStartVec);
92         INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
93         INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
94         /*
95          * set negative values to 0
96          */
97         invSqrt = vdupq_m(invSqrt, (float16_t)0.0f, vcmpltq(invSqrt, (float16_t)0.0f));
98         /*
99          * sqrt(x) = x * invSqrt(x)
100          */
101         sum = vmulq(sum, invSqrt);
102         vstrhq_f16(pDst, sum);
103         pDst += 8;
104         /*
105          * Decrement the blockSize loop counter
106          */
107         blkCnt--;
108     }
109     /*
110      * tail
111      */
112     blkCnt = blockSize & 7;
113     if (blkCnt > 0U)
114     {
115         mve_pred16_t p0 = vctp16q(blkCnt);
116         q15x8_t newtonStartVec;
117         f16x8_t sumHalf, invSqrt;
118 
119         vecSrc = vld2q((float16_t const *)pSrc);
120         sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
121         sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
122 
123         /*
124          * inlined Fast SQRT using inverse SQRT newton-raphson method
125          */
126 
127         /* compute initial value */
128         newtonStartVec = vdupq_n_s16(INVSQRT_MAGIC_F16) - vshrq((q15x8_t) sum, 1);
129         sumHalf = vmulq(sum, (float16_t)0.5);
130         /*
131          * compute 2 x iterations
132          */
133         INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, (f16x8_t) newtonStartVec);
134         INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
135         /*
136          * set negative values to 0
137          */
138         invSqrt = vdupq_m(invSqrt, (float16_t)0.0, vcmpltq(invSqrt, (float16_t)0.0));
139         /*
140          * sqrt(x) = x * invSqrt(x)
141          */
142         sum = vmulq(sum, invSqrt);
143         vstrhq_p_f16(pDst, sum, p0);
144     }
145 }
146 
147 #else
arm_cmplx_mag_f16(const float16_t * pSrc,float16_t * pDst,uint32_t numSamples)148 void arm_cmplx_mag_f16(
149   const float16_t * pSrc,
150         float16_t * pDst,
151         uint32_t numSamples)
152 {
153   uint32_t blkCnt;                               /* loop counter */
154   _Float16 real, imag;                      /* Temporary variables to hold input values */
155 
156 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
157 
158   /* Loop unrolling: Compute 4 outputs at a time */
159   blkCnt = numSamples >> 2U;
160 
161   while (blkCnt > 0U)
162   {
163     /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
164 
165     real = *pSrc++;
166     imag = *pSrc++;
167 
168     /* store result in destination buffer. */
169     arm_sqrt_f16((real * real) + (imag * imag), pDst++);
170 
171     real = *pSrc++;
172     imag = *pSrc++;
173     arm_sqrt_f16((real * real) + (imag * imag), pDst++);
174 
175     real = *pSrc++;
176     imag = *pSrc++;
177     arm_sqrt_f16((real * real) + (imag * imag), pDst++);
178 
179     real = *pSrc++;
180     imag = *pSrc++;
181     arm_sqrt_f16((real * real) + (imag * imag), pDst++);
182 
183     /* Decrement loop counter */
184     blkCnt--;
185   }
186 
187   /* Loop unrolling: Compute remaining outputs */
188   blkCnt = numSamples % 0x4U;
189 
190 #else
191 
192   /* Initialize blkCnt with number of samples */
193   blkCnt = numSamples;
194 
195 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
196 
197   while (blkCnt > 0U)
198   {
199     /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
200 
201     real = *pSrc++;
202     imag = *pSrc++;
203 
204     /* store result in destination buffer. */
205     arm_sqrt_f16((real * real) + (imag * imag), pDst++);
206 
207     /* Decrement loop counter */
208     blkCnt--;
209   }
210 
211 }
212 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
213 
214 /**
215   @} end of cmplx_mag group
216  */
217 
218 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
219