1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cmplx_mag_f32.c
4 * Description: Floating-point complex magnitude
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/complex_math_functions.h"
30
31 /**
32 @ingroup groupCmplxMath
33 */
34
35 /**
36 @defgroup cmplx_mag Complex Magnitude
37
38 Computes the magnitude of the elements of a complex data vector.
39
40 The <code>pSrc</code> points to the source data and
41 <code>pDst</code> points to the where the result should be written.
42 <code>numSamples</code> specifies the number of complex samples
43 in the input array and the data is stored in an interleaved fashion
44 (real, imag, real, imag, ...).
45 The input array has a total of <code>2*numSamples</code> values;
46 the output array has a total of <code>numSamples</code> values.
47
48 The underlying algorithm is used:
49
50 <pre>
51 for (n = 0; n < numSamples; n++) {
52 pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
53 }
54 </pre>
55
56 There are separate functions for floating-point, Q15, and Q31 data types.
57 */
58
59 /**
60 @addtogroup cmplx_mag
61 @{
62 */
63
64 /**
65 @brief Floating-point complex magnitude.
66 @param[in] pSrc points to input vector
67 @param[out] pDst points to output vector
68 @param[in] numSamples number of samples in each vector
69 */
70
71 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
72 #include "arm_vec_math.h"
73 #endif
74
75 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
76
77 #include "arm_helium_utils.h"
78
79
arm_cmplx_mag_f32(const float32_t * pSrc,float32_t * pDst,uint32_t numSamples)80 ARM_DSP_ATTRIBUTE void arm_cmplx_mag_f32(
81 const float32_t * pSrc,
82 float32_t * pDst,
83 uint32_t numSamples)
84 {
85 int32_t blockSize = numSamples; /* loop counters */
86 uint32_t blkCnt; /* loop counters */
87 f32x4x2_t vecSrc;
88 f32x4_t sum;
89 float32_t real, imag; /* Temporary variables to hold input values */
90
91 /* Compute 4 complex samples at a time */
92 blkCnt = blockSize >> 2;
93 while (blkCnt > 0U)
94 {
95 q31x4_t newtonStartVec;
96 f32x4_t sumHalf, invSqrt;
97
98 vecSrc = vld2q(pSrc);
99 pSrc += 8;
100 sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
101 sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
102
103 /*
104 * inlined Fast SQRT using inverse SQRT newton-raphson method
105 */
106
107 /* compute initial value */
108 newtonStartVec = vdupq_n_s32(INVSQRT_MAGIC_F32) - vshrq((q31x4_t) sum, 1);
109 sumHalf = sum * 0.5f;
110 /*
111 * compute 3 x iterations
112 *
113 * The more iterations, the more accuracy.
114 * If you need to trade a bit of accuracy for more performance,
115 * you can comment out the 3rd use of the macro.
116 */
117 INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, (f32x4_t) newtonStartVec);
118 INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, invSqrt);
119 INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, invSqrt);
120 /*
121 * set negative values to 0
122 */
123 invSqrt = vdupq_m(invSqrt, 0.0f, vcmpltq(invSqrt, 0.0f));
124 /*
125 * sqrt(x) = x * invSqrt(x)
126 */
127 sum = vmulq(sum, invSqrt);
128 vst1q(pDst, sum);
129 pDst += 4;
130 /*
131 * Decrement the blockSize loop counter
132 */
133 blkCnt--;
134 }
135 /*
136 * tail
137 */
138 blkCnt = blockSize & 3;
139 while (blkCnt > 0U)
140 {
141 /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
142
143 real = *pSrc++;
144 imag = *pSrc++;
145
146 /* store result in destination buffer. */
147 arm_sqrt_f32((real * real) + (imag * imag), pDst++);
148
149 /* Decrement loop counter */
150 blkCnt--;
151 }
152 }
153
154 #else
arm_cmplx_mag_f32(const float32_t * pSrc,float32_t * pDst,uint32_t numSamples)155 ARM_DSP_ATTRIBUTE void arm_cmplx_mag_f32(
156 const float32_t * pSrc,
157 float32_t * pDst,
158 uint32_t numSamples)
159 {
160 uint32_t blkCnt; /* loop counter */
161 float32_t real, imag; /* Temporary variables to hold input values */
162
163 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
164
165 float32x4x2_t vecA;
166 float32x4_t vRealA;
167 float32x4_t vImagA;
168 float32x4_t vMagSqA;
169
170 float32x4x2_t vecB;
171 float32x4_t vRealB;
172 float32x4_t vImagB;
173 float32x4_t vMagSqB;
174
175 /* Loop unrolling: Compute 8 outputs at a time */
176 blkCnt = numSamples >> 3;
177
178 while (blkCnt > 0U)
179 {
180 /* out = sqrt((real * real) + (imag * imag)) */
181
182 vecA = vld2q_f32(pSrc);
183 pSrc += 8;
184
185 vecB = vld2q_f32(pSrc);
186 pSrc += 8;
187
188 vRealA = vmulq_f32(vecA.val[0], vecA.val[0]);
189 vImagA = vmulq_f32(vecA.val[1], vecA.val[1]);
190 vMagSqA = vaddq_f32(vRealA, vImagA);
191
192 vRealB = vmulq_f32(vecB.val[0], vecB.val[0]);
193 vImagB = vmulq_f32(vecB.val[1], vecB.val[1]);
194 vMagSqB = vaddq_f32(vRealB, vImagB);
195
196 /* Store the result in the destination buffer. */
197 vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(vMagSqA));
198 pDst += 4;
199
200 vst1q_f32(pDst, __arm_vec_sqrt_f32_neon(vMagSqB));
201 pDst += 4;
202
203 /* Decrement the loop counter */
204 blkCnt--;
205 }
206
207 blkCnt = numSamples & 7;
208
209 #else
210
211 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
212
213 /* Loop unrolling: Compute 4 outputs at a time */
214 blkCnt = numSamples >> 2U;
215
216 while (blkCnt > 0U)
217 {
218 /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
219
220 real = *pSrc++;
221 imag = *pSrc++;
222
223 /* store result in destination buffer. */
224 arm_sqrt_f32((real * real) + (imag * imag), pDst++);
225
226 real = *pSrc++;
227 imag = *pSrc++;
228 arm_sqrt_f32((real * real) + (imag * imag), pDst++);
229
230 real = *pSrc++;
231 imag = *pSrc++;
232 arm_sqrt_f32((real * real) + (imag * imag), pDst++);
233
234 real = *pSrc++;
235 imag = *pSrc++;
236 arm_sqrt_f32((real * real) + (imag * imag), pDst++);
237
238 /* Decrement loop counter */
239 blkCnt--;
240 }
241
242 /* Loop unrolling: Compute remaining outputs */
243 blkCnt = numSamples % 0x4U;
244
245 #else
246
247 /* Initialize blkCnt with number of samples */
248 blkCnt = numSamples;
249
250 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
251 #endif /* #if defined(ARM_MATH_NEON) */
252
253 while (blkCnt > 0U)
254 {
255 /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
256
257 real = *pSrc++;
258 imag = *pSrc++;
259
260 /* store result in destination buffer. */
261 arm_sqrt_f32((real * real) + (imag * imag), pDst++);
262
263 /* Decrement loop counter */
264 blkCnt--;
265 }
266
267 }
268 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
269
270 /**
271 @} end of cmplx_mag group
272 */
273