1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cmplx_mag_f16.c
4 * Description: Floating-point complex magnitude
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/complex_math_functions_f16.h"
30
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 /**
33 @ingroup groupCmplxMath
34 */
35
36 /**
37 @defgroup cmplx_mag Complex Magnitude
38
39 Computes the magnitude of the elements of a complex data vector.
40
41 The <code>pSrc</code> points to the source data and
42 <code>pDst</code> points to the where the result should be written.
43 <code>numSamples</code> specifies the number of complex samples
44 in the input array and the data is stored in an interleaved fashion
45 (real, imag, real, imag, ...).
46 The input array has a total of <code>2*numSamples</code> values;
47 the output array has a total of <code>numSamples</code> values.
48
49 The underlying algorithm is used:
50
51 <pre>
52 for (n = 0; n < numSamples; n++) {
53 pDst[n] = sqrt(pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2);
54 }
55 </pre>
56
57 There are separate functions for floating-point, Q15, and Q31 data types.
58 */
59
60 /**
61 @addtogroup cmplx_mag
62 @{
63 */
64
65 /**
66 @brief Floating-point complex magnitude.
67 @param[in] pSrc points to input vector
68 @param[out] pDst points to output vector
69 @param[in] numSamples number of samples in each vector
70 @return none
71 */
72
73 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
74
75 #include "arm_helium_utils.h"
76
77
arm_cmplx_mag_f16(const float16_t * pSrc,float16_t * pDst,uint32_t numSamples)78 void arm_cmplx_mag_f16(
79 const float16_t * pSrc,
80 float16_t * pDst,
81 uint32_t numSamples)
82 {
83 int32_t blockSize = numSamples; /* loop counters */
84 uint32_t blkCnt; /* loop counters */
85 f16x8x2_t vecSrc;
86 f16x8_t sum;
87
88 /* Compute 4 complex samples at a time */
89 blkCnt = blockSize >> 3;
90 while (blkCnt > 0U)
91 {
92 q15x8_t newtonStartVec;
93 f16x8_t sumHalf, invSqrt;
94
95 vecSrc = vld2q(pSrc);
96 pSrc += 16;
97 sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
98 sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
99
100 /*
101 * inlined Fast SQRT using inverse SQRT newton-raphson method
102 */
103
104 /* compute initial value */
105 newtonStartVec = vdupq_n_s16(INVSQRT_MAGIC_F16) - vshrq((q15x8_t) sum, 1);
106 sumHalf = sum * 0.5f;
107 /*
108 * compute 3 x iterations
109 *
110 * The more iterations, the more accuracy.
111 * If you need to trade a bit of accuracy for more performance,
112 * you can comment out the 3rd use of the macro.
113 */
114 INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, (f16x8_t) newtonStartVec);
115 INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
116 INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
117 /*
118 * set negative values to 0
119 */
120 invSqrt = vdupq_m(invSqrt, (float16_t)0.0f, vcmpltq(invSqrt, (float16_t)0.0f));
121 /*
122 * sqrt(x) = x * invSqrt(x)
123 */
124 sum = vmulq(sum, invSqrt);
125 vstrhq_f16(pDst, sum);
126 pDst += 8;
127 /*
128 * Decrement the blockSize loop counter
129 */
130 blkCnt--;
131 }
132 /*
133 * tail
134 */
135 blkCnt = blockSize & 7;
136 if (blkCnt > 0U)
137 {
138 mve_pred16_t p0 = vctp16q(blkCnt);
139 q15x8_t newtonStartVec;
140 f16x8_t sumHalf, invSqrt;
141
142 vecSrc = vld2q((float16_t const *)pSrc);
143 sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
144 sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
145
146 /*
147 * inlined Fast SQRT using inverse SQRT newton-raphson method
148 */
149
150 /* compute initial value */
151 newtonStartVec = vdupq_n_s16(INVSQRT_MAGIC_F16) - vshrq((q15x8_t) sum, 1);
152 sumHalf = vmulq(sum, (float16_t)0.5);
153 /*
154 * compute 2 x iterations
155 */
156 INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, (f16x8_t) newtonStartVec);
157 INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
158 /*
159 * set negative values to 0
160 */
161 invSqrt = vdupq_m(invSqrt, (float16_t)0.0, vcmpltq(invSqrt, (float16_t)0.0));
162 /*
163 * sqrt(x) = x * invSqrt(x)
164 */
165 sum = vmulq(sum, invSqrt);
166 vstrhq_p_f16(pDst, sum, p0);
167 }
168 }
169
170 #else
arm_cmplx_mag_f16(const float16_t * pSrc,float16_t * pDst,uint32_t numSamples)171 void arm_cmplx_mag_f16(
172 const float16_t * pSrc,
173 float16_t * pDst,
174 uint32_t numSamples)
175 {
176 uint32_t blkCnt; /* loop counter */
177 _Float16 real, imag; /* Temporary variables to hold input values */
178
179 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
180
181 /* Loop unrolling: Compute 4 outputs at a time */
182 blkCnt = numSamples >> 2U;
183
184 while (blkCnt > 0U)
185 {
186 /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
187
188 real = *pSrc++;
189 imag = *pSrc++;
190
191 /* store result in destination buffer. */
192 arm_sqrt_f16((real * real) + (imag * imag), pDst++);
193
194 real = *pSrc++;
195 imag = *pSrc++;
196 arm_sqrt_f16((real * real) + (imag * imag), pDst++);
197
198 real = *pSrc++;
199 imag = *pSrc++;
200 arm_sqrt_f16((real * real) + (imag * imag), pDst++);
201
202 real = *pSrc++;
203 imag = *pSrc++;
204 arm_sqrt_f16((real * real) + (imag * imag), pDst++);
205
206 /* Decrement loop counter */
207 blkCnt--;
208 }
209
210 /* Loop unrolling: Compute remaining outputs */
211 blkCnt = numSamples % 0x4U;
212
213 #else
214
215 /* Initialize blkCnt with number of samples */
216 blkCnt = numSamples;
217
218 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
219
220 while (blkCnt > 0U)
221 {
222 /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
223
224 real = *pSrc++;
225 imag = *pSrc++;
226
227 /* store result in destination buffer. */
228 arm_sqrt_f16((real * real) + (imag * imag), pDst++);
229
230 /* Decrement loop counter */
231 blkCnt--;
232 }
233
234 }
235 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
236
237 /**
238 @} end of cmplx_mag group
239 */
240
241 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */