1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_mag_squared_f32.c
4  * Description:  Floating-point complex magnitude squared
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions.h"
30 
31 /**
32   @ingroup groupCmplxMath
33  */
34 
35 /**
36   @defgroup cmplx_mag_squared Complex Magnitude Squared
37 
38   Computes the magnitude squared of the elements of a complex data vector.
39 
40   The <code>pSrc</code> points to the source data and
41   <code>pDst</code> points to where the result should be written.
42   <code>numSamples</code> specifies the number of complex samples
43   in the input array and the data is stored in an interleaved fashion
44   (real, imag, real, imag, ...).
45   The input array has a total of <code>2*numSamples</code> values;
46   the output array has a total of <code>numSamples</code> values.
47 
48   The underlying algorithm is:
49 
50   <pre>
51   for (n = 0; n < numSamples; n++) {
52       pDst[n] = pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2;
53   }
54   </pre>
55 
56   There are separate functions for floating-point, Q15, and Q31 data types.
57  */
58 
59 /**
60   @addtogroup cmplx_mag_squared
61   @{
62  */
63 
64 /**
65   @brief         Floating-point complex magnitude squared.
66   @param[in]     pSrc        points to input vector
67   @param[out]    pDst        points to output vector
68   @param[in]     numSamples  number of complex samples in the input vector (and of real values in the output vector)
69  */
70 
71 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
72 
arm_cmplx_mag_squared_f32(const float32_t * pSrc,float32_t * pDst,uint32_t numSamples)73 ARM_DSP_ATTRIBUTE void arm_cmplx_mag_squared_f32(
74   const float32_t * pSrc,
75         float32_t * pDst,
76         uint32_t numSamples)
77 {
78     int32_t blockSize = numSamples;  /* loop counters */
79     uint32_t  blkCnt;           /* loop counters */
80     f32x4x2_t vecSrc;
81     f32x4_t sum;
82     float32_t real, imag;                          /* Temporary input variables */
83 
84     /* Compute 4 complex samples at a time */
85     blkCnt = blockSize >> 2;
86     while (blkCnt > 0U)
87     {
88         vecSrc = vld2q(pSrc);
89         sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
90         sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
91         vst1q(pDst, sum);
92 
93         pSrc += 8;
94         pDst += 4;
95 
96         /*
97          * Decrement the blockSize loop counter
98          */
99         blkCnt--;
100     }
101 
102     /* Tail */
103     blkCnt = blockSize & 3;
104     while (blkCnt > 0U)
105     {
106       /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
107 
108       real = *pSrc++;
109       imag = *pSrc++;
110 
111       /* store result in destination buffer. */
112       *pDst++ = (real * real) + (imag * imag);
113 
114       /* Decrement loop counter */
115       blkCnt--;
116     }
117 
118 }
119 
120 #else
arm_cmplx_mag_squared_f32(const float32_t * pSrc,float32_t * pDst,uint32_t numSamples)121 ARM_DSP_ATTRIBUTE void arm_cmplx_mag_squared_f32(
122   const float32_t * pSrc,
123         float32_t * pDst,
124         uint32_t numSamples)
125 {
126         uint32_t blkCnt;                               /* Loop counter */
127         float32_t real, imag;                          /* Temporary input variables */
128 
129 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
130   float32x4x2_t vecA;
131   float32x4_t vRealA;
132   float32x4_t vImagA;
133   float32x4_t vMagSqA;
134 
135   float32x4x2_t vecB;
136   float32x4_t vRealB;
137   float32x4_t vImagB;
138   float32x4_t vMagSqB;
139 
140   /* Loop unrolling: Compute 8 outputs at a time */
141   blkCnt = numSamples >> 3;
142 
143   while (blkCnt > 0U)
144   {
145     /* out = sqrt((real * real) + (imag * imag)) */
146 
147     vecA = vld2q_f32(pSrc);
148     pSrc += 8;
149 
150     vRealA = vmulq_f32(vecA.val[0], vecA.val[0]);
151     vImagA = vmulq_f32(vecA.val[1], vecA.val[1]);
152     vMagSqA = vaddq_f32(vRealA, vImagA);
153 
154     vecB = vld2q_f32(pSrc);
155     pSrc += 8;
156 
157     vRealB = vmulq_f32(vecB.val[0], vecB.val[0]);
158     vImagB = vmulq_f32(vecB.val[1], vecB.val[1]);
159     vMagSqB = vaddq_f32(vRealB, vImagB);
160 
161     /* Store the result in the destination buffer. */
162     vst1q_f32(pDst, vMagSqA);
163     pDst += 4;
164 
165     vst1q_f32(pDst, vMagSqB);
166     pDst += 4;
167 
168     /* Decrement the loop counter */
169     blkCnt--;
170   }
171 
172   blkCnt = numSamples & 7;
173 
174 #else
175 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
176 
177   /* Loop unrolling: Compute 4 outputs at a time */
178   blkCnt = numSamples >> 2U;
179 
180   while (blkCnt > 0U)
181   {
182     /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
183 
184     real = *pSrc++;
185     imag = *pSrc++;
186     *pDst++ = (real * real) + (imag * imag);
187 
188     real = *pSrc++;
189     imag = *pSrc++;
190     *pDst++ = (real * real) + (imag * imag);
191 
192     real = *pSrc++;
193     imag = *pSrc++;
194     *pDst++ = (real * real) + (imag * imag);
195 
196     real = *pSrc++;
197     imag = *pSrc++;
198     *pDst++ = (real * real) + (imag * imag);
199 
200     /* Decrement loop counter */
201     blkCnt--;
202   }
203 
204   /* Loop unrolling: Compute remaining outputs */
205   blkCnt = numSamples % 0x4U;
206 
207 #else
208 
209   /* Initialize blkCnt with number of samples */
210   blkCnt = numSamples;
211 
212 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
213 #endif /* #if defined(ARM_MATH_NEON) */
214 
215   while (blkCnt > 0U)
216   {
217     /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
218 
219     real = *pSrc++;
220     imag = *pSrc++;
221 
222     /* store result in destination buffer. */
223     *pDst++ = (real * real) + (imag * imag);
224 
225     /* Decrement loop counter */
226     blkCnt--;
227   }
228 
229 }
230 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
231 
232 /**
233   @} end of cmplx_mag_squared group
234  */
235