1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_mag_f16.c
4  * Description:  Floating-point complex magnitude
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 /**
33   @ingroup groupCmplxMath
34  */
35 
36 /**
37   @defgroup cmplx_mag Complex Magnitude
38 
39   Computes the magnitude of the elements of a complex data vector.
40 
41   The <code>pSrc</code> points to the source data and
42   <code>pDst</code> points to the where the result should be written.
43   <code>numSamples</code> specifies the number of complex samples
44   in the input array and the data is stored in an interleaved fashion
45   (real, imag, real, imag, ...).
46   The input array has a total of <code>2*numSamples</code> values;
47   the output array has a total of <code>numSamples</code> values.
48 
49   The underlying algorithm is used:
50 
51   <pre>
52   for (n = 0; n < numSamples; n++) {
53       pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
54   }
55   </pre>
56 
57   There are separate functions for floating-point, Q15, and Q31 data types.
58  */
59 
60 /**
61   @addtogroup cmplx_mag
62   @{
63  */
64 
65 /**
66   @brief         Floating-point complex magnitude.
67   @param[in]     pSrc        points to input vector
68   @param[out]    pDst        points to output vector
69   @param[in]     numSamples  number of samples in each vector
70   @return        none
71  */
72 
73 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
74 
75 #include "arm_helium_utils.h"
76 
77 
arm_cmplx_mag_f16(const float16_t * pSrc,float16_t * pDst,uint32_t numSamples)78 void arm_cmplx_mag_f16(
79   const float16_t * pSrc,
80         float16_t * pDst,
81         uint32_t numSamples)
82 {
83     int32_t blockSize = numSamples;  /* loop counters */
84     uint32_t  blkCnt;           /* loop counters */
85     f16x8x2_t vecSrc;
86     f16x8_t sum;
87 
88     /* Compute 4 complex samples at a time */
89     blkCnt = blockSize >> 3;
90     while (blkCnt > 0U)
91     {
92         q15x8_t newtonStartVec;
93         f16x8_t sumHalf, invSqrt;
94 
95         vecSrc = vld2q(pSrc);
96         pSrc += 16;
97         sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
98         sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
99 
100         /*
101          * inlined Fast SQRT using inverse SQRT newton-raphson method
102          */
103 
104         /* compute initial value */
105         newtonStartVec = vdupq_n_s16(INVSQRT_MAGIC_F16) - vshrq((q15x8_t) sum, 1);
106         sumHalf = sum * 0.5f;
107         /*
108          * compute 3 x iterations
109          *
110          * The more iterations, the more accuracy.
111          * If you need to trade a bit of accuracy for more performance,
112          * you can comment out the 3rd use of the macro.
113          */
114         INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, (f16x8_t) newtonStartVec);
115         INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
116         INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
117         /*
118          * set negative values to 0
119          */
120         invSqrt = vdupq_m(invSqrt, (float16_t)0.0f, vcmpltq(invSqrt, (float16_t)0.0f));
121         /*
122          * sqrt(x) = x * invSqrt(x)
123          */
124         sum = vmulq(sum, invSqrt);
125         vstrhq_f16(pDst, sum);
126         pDst += 8;
127         /*
128          * Decrement the blockSize loop counter
129          */
130         blkCnt--;
131     }
132     /*
133      * tail
134      */
135     blkCnt = blockSize & 7;
136     if (blkCnt > 0U)
137     {
138         mve_pred16_t p0 = vctp16q(blkCnt);
139         q15x8_t newtonStartVec;
140         f16x8_t sumHalf, invSqrt;
141 
142         vecSrc = vld2q((float16_t const *)pSrc);
143         sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
144         sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
145 
146         /*
147          * inlined Fast SQRT using inverse SQRT newton-raphson method
148          */
149 
150         /* compute initial value */
151         newtonStartVec = vdupq_n_s16(INVSQRT_MAGIC_F16) - vshrq((q15x8_t) sum, 1);
152         sumHalf = vmulq(sum, (float16_t)0.5);
153         /*
154          * compute 2 x iterations
155          */
156         INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, (f16x8_t) newtonStartVec);
157         INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
158         /*
159          * set negative values to 0
160          */
161         invSqrt = vdupq_m(invSqrt, (float16_t)0.0, vcmpltq(invSqrt, (float16_t)0.0));
162         /*
163          * sqrt(x) = x * invSqrt(x)
164          */
165         sum = vmulq(sum, invSqrt);
166         vstrhq_p_f16(pDst, sum, p0);
167     }
168 }
169 
170 #else
arm_cmplx_mag_f16(const float16_t * pSrc,float16_t * pDst,uint32_t numSamples)171 void arm_cmplx_mag_f16(
172   const float16_t * pSrc,
173         float16_t * pDst,
174         uint32_t numSamples)
175 {
176   uint32_t blkCnt;                               /* loop counter */
177   _Float16 real, imag;                      /* Temporary variables to hold input values */
178 
179 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
180 
181   /* Loop unrolling: Compute 4 outputs at a time */
182   blkCnt = numSamples >> 2U;
183 
184   while (blkCnt > 0U)
185   {
186     /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
187 
188     real = *pSrc++;
189     imag = *pSrc++;
190 
191     /* store result in destination buffer. */
192     arm_sqrt_f16((real * real) + (imag * imag), pDst++);
193 
194     real = *pSrc++;
195     imag = *pSrc++;
196     arm_sqrt_f16((real * real) + (imag * imag), pDst++);
197 
198     real = *pSrc++;
199     imag = *pSrc++;
200     arm_sqrt_f16((real * real) + (imag * imag), pDst++);
201 
202     real = *pSrc++;
203     imag = *pSrc++;
204     arm_sqrt_f16((real * real) + (imag * imag), pDst++);
205 
206     /* Decrement loop counter */
207     blkCnt--;
208   }
209 
210   /* Loop unrolling: Compute remaining outputs */
211   blkCnt = numSamples % 0x4U;
212 
213 #else
214 
215   /* Initialize blkCnt with number of samples */
216   blkCnt = numSamples;
217 
218 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
219 
220   while (blkCnt > 0U)
221   {
222     /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
223 
224     real = *pSrc++;
225     imag = *pSrc++;
226 
227     /* store result in destination buffer. */
228     arm_sqrt_f16((real * real) + (imag * imag), pDst++);
229 
230     /* Decrement loop counter */
231     blkCnt--;
232   }
233 
234 }
235 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
236 
237 /**
238   @} end of cmplx_mag group
239  */
240 
241 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */