1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_mag_f32.c
4  * Description:  Floating-point complex magnitude
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions.h"
30 
31 /**
32   @ingroup groupCmplxMath
33  */
34 
35 /**
36   @defgroup cmplx_mag Complex Magnitude
37 
38   Computes the magnitude of the elements of a complex data vector.
39 
40   The <code>pSrc</code> points to the source data and
41   <code>pDst</code> points to the where the result should be written.
42   <code>numSamples</code> specifies the number of complex samples
43   in the input array and the data is stored in an interleaved fashion
44   (real, imag, real, imag, ...).
45   The input array has a total of <code>2*numSamples</code> values;
46   the output array has a total of <code>numSamples</code> values.
47 
48   The underlying algorithm is used:
49 
50   <pre>
51   for (n = 0; n < numSamples; n++) {
52       pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
53   }
54   </pre>
55 
56   There are separate functions for floating-point, Q15, and Q31 data types.
57  */
58 
59 /**
60   @addtogroup cmplx_mag
61   @{
62  */
63 
64 /**
65   @brief         Floating-point complex magnitude.
66   @param[in]     pSrc        points to input vector
67   @param[out]    pDst        points to output vector
68   @param[in]     numSamples  number of samples in each vector
69  */
70 
71 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
72 #include "arm_vec_math.h"
73 #endif
74 
75 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
76 
77 #include "arm_helium_utils.h"
78 
79 
arm_cmplx_mag_f32(const float32_t * pSrc,float32_t * pDst,uint32_t numSamples)80 ARM_DSP_ATTRIBUTE void arm_cmplx_mag_f32(
81   const float32_t * pSrc,
82         float32_t * pDst,
83         uint32_t numSamples)
84 {
85     int32_t blockSize = numSamples;  /* loop counters */
86     uint32_t  blkCnt;           /* loop counters */
87     f32x4x2_t vecSrc;
88     f32x4_t sum;
89     float32_t real, imag;                      /* Temporary variables to hold input values */
90 
91     /* Compute 4 complex samples at a time */
92     blkCnt = blockSize >> 2;
93     while (blkCnt > 0U)
94     {
95         q31x4_t newtonStartVec;
96         f32x4_t sumHalf, invSqrt;
97 
98         vecSrc = vld2q(pSrc);
99         pSrc += 8;
100         sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
101         sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
102 
103         /*
104          * inlined Fast SQRT using inverse SQRT newton-raphson method
105          */
106 
107         /* compute initial value */
108         newtonStartVec = vdupq_n_s32(INVSQRT_MAGIC_F32) - vshrq((q31x4_t) sum, 1);
109         sumHalf = sum * 0.5f;
110         /*
111          * compute 3 x iterations
112          *
113          * The more iterations, the more accuracy.
114          * If you need to trade a bit of accuracy for more performance,
115          * you can comment out the 3rd use of the macro.
116          */
117         INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, (f32x4_t) newtonStartVec);
118         INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, invSqrt);
119         INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, invSqrt);
120         /*
121          * set negative values to 0
122          */
123         invSqrt = vdupq_m(invSqrt, 0.0f, vcmpltq(invSqrt, 0.0f));
124         /*
125          * sqrt(x) = x * invSqrt(x)
126          */
127         sum = vmulq(sum, invSqrt);
128         vst1q(pDst, sum);
129         pDst += 4;
130         /*
131          * Decrement the blockSize loop counter
132          */
133         blkCnt--;
134     }
135     /*
136      * tail
137      */
138     blkCnt = blockSize & 3;
139     while (blkCnt > 0U)
140     {
141       /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
142 
143       real = *pSrc++;
144       imag = *pSrc++;
145 
146       /* store result in destination buffer. */
147       arm_sqrt_f32((real * real) + (imag * imag), pDst++);
148 
149       /* Decrement loop counter */
150       blkCnt--;
151     }
152 }
153 
154 #else
arm_cmplx_mag_f32(const float32_t * pSrc,float32_t * pDst,uint32_t numSamples)155 ARM_DSP_ATTRIBUTE void arm_cmplx_mag_f32(
156   const float32_t * pSrc,
157         float32_t * pDst,
158         uint32_t numSamples)
159 {
160   uint32_t blkCnt;                               /* loop counter */
161   float32_t real, imag;                      /* Temporary variables to hold input values */
162 
163 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
164 
165   float32x4x2_t vecA;
166   float32x4_t vRealA;
167   float32x4_t vImagA;
168   float32x4_t vMagSqA;
169 
170   float32x4x2_t vecB;
171   float32x4_t vRealB;
172   float32x4_t vImagB;
173   float32x4_t vMagSqB;
174 
175   /* Loop unrolling: Compute 8 outputs at a time */
176   blkCnt = numSamples >> 3;
177 
178   while (blkCnt > 0U)
179   {
180     /* out = sqrt((real * real) + (imag * imag)) */
181 
182     vecA = vld2q_f32(pSrc);
183     pSrc += 8;
184 
185     vecB = vld2q_f32(pSrc);
186     pSrc += 8;
187 
188     vRealA = vmulq_f32(vecA.val[0], vecA.val[0]);
189     vImagA = vmulq_f32(vecA.val[1], vecA.val[1]);
190     vMagSqA = vaddq_f32(vRealA, vImagA);
191 
192     vRealB = vmulq_f32(vecB.val[0], vecB.val[0]);
193     vImagB = vmulq_f32(vecB.val[1], vecB.val[1]);
194     vMagSqB = vaddq_f32(vRealB, vImagB);
195 
196     /* Store the result in the destination buffer. */
197     vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(vMagSqA));
198     pDst += 4;
199 
200     vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(vMagSqB));
201     pDst += 4;
202 
203     /* Decrement the loop counter */
204     blkCnt--;
205   }
206 
207   blkCnt = numSamples & 7;
208 
209 #else
210 
211 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
212 
213   /* Loop unrolling: Compute 4 outputs at a time */
214   blkCnt = numSamples >> 2U;
215 
216   while (blkCnt > 0U)
217   {
218     /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
219 
220     real = *pSrc++;
221     imag = *pSrc++;
222 
223     /* store result in destination buffer. */
224     arm_sqrt_f32((real * real) + (imag * imag), pDst++);
225 
226     real = *pSrc++;
227     imag = *pSrc++;
228     arm_sqrt_f32((real * real) + (imag * imag), pDst++);
229 
230     real = *pSrc++;
231     imag = *pSrc++;
232     arm_sqrt_f32((real * real) + (imag * imag), pDst++);
233 
234     real = *pSrc++;
235     imag = *pSrc++;
236     arm_sqrt_f32((real * real) + (imag * imag), pDst++);
237 
238     /* Decrement loop counter */
239     blkCnt--;
240   }
241 
242   /* Loop unrolling: Compute remaining outputs */
243   blkCnt = numSamples % 0x4U;
244 
245 #else
246 
247   /* Initialize blkCnt with number of samples */
248   blkCnt = numSamples;
249 
250 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
251 #endif /* #if defined(ARM_MATH_NEON) */
252 
253   while (blkCnt > 0U)
254   {
255     /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
256 
257     real = *pSrc++;
258     imag = *pSrc++;
259 
260     /* store result in destination buffer. */
261     arm_sqrt_f32((real * real) + (imag * imag), pDst++);
262 
263     /* Decrement loop counter */
264     blkCnt--;
265   }
266 
267 }
268 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
269 
270 /**
271   @} end of cmplx_mag group
272  */
273