1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cmplx_mag_f16.c
4 * Description: Floating-point complex magnitude
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/complex_math_functions_f16.h"
30
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 /**
33 @ingroup groupCmplxMath
34 */
35
36
37
38 /**
39 @addtogroup cmplx_mag
40 @{
41 */
42
43 /**
44 @brief Floating-point complex magnitude.
45 @param[in] pSrc points to input vector
46 @param[out] pDst points to output vector
47 @param[in] numSamples number of samples in each vector
48 */
49
50 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
51
52 #include "arm_helium_utils.h"
53
54
arm_cmplx_mag_f16(const float16_t * pSrc,float16_t * pDst,uint32_t numSamples)55 void arm_cmplx_mag_f16(
56 const float16_t * pSrc,
57 float16_t * pDst,
58 uint32_t numSamples)
59 {
60 int32_t blockSize = numSamples; /* loop counters */
61 uint32_t blkCnt; /* loop counters */
62 f16x8x2_t vecSrc;
63 f16x8_t sum;
64
65 /* Compute 4 complex samples at a time */
66 blkCnt = blockSize >> 3;
67 while (blkCnt > 0U)
68 {
69 q15x8_t newtonStartVec;
70 f16x8_t sumHalf, invSqrt;
71
72 vecSrc = vld2q(pSrc);
73 pSrc += 16;
74 sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
75 sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
76
77 /*
78 * inlined Fast SQRT using inverse SQRT newton-raphson method
79 */
80
81 /* compute initial value */
82 newtonStartVec = vdupq_n_s16(INVSQRT_MAGIC_F16) - vshrq((q15x8_t) sum, 1);
83 sumHalf = sum * 0.5f;
84 /*
85 * compute 3 x iterations
86 *
87 * The more iterations, the more accuracy.
88 * If you need to trade a bit of accuracy for more performance,
89 * you can comment out the 3rd use of the macro.
90 */
91 INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, (f16x8_t) newtonStartVec);
92 INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
93 INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
94 /*
95 * set negative values to 0
96 */
97 invSqrt = vdupq_m(invSqrt, (float16_t)0.0f, vcmpltq(invSqrt, (float16_t)0.0f));
98 /*
99 * sqrt(x) = x * invSqrt(x)
100 */
101 sum = vmulq(sum, invSqrt);
102 vstrhq_f16(pDst, sum);
103 pDst += 8;
104 /*
105 * Decrement the blockSize loop counter
106 */
107 blkCnt--;
108 }
109 /*
110 * tail
111 */
112 blkCnt = blockSize & 7;
113 if (blkCnt > 0U)
114 {
115 mve_pred16_t p0 = vctp16q(blkCnt);
116 q15x8_t newtonStartVec;
117 f16x8_t sumHalf, invSqrt;
118
119 vecSrc = vld2q((float16_t const *)pSrc);
120 sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
121 sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
122
123 /*
124 * inlined Fast SQRT using inverse SQRT newton-raphson method
125 */
126
127 /* compute initial value */
128 newtonStartVec = vdupq_n_s16(INVSQRT_MAGIC_F16) - vshrq((q15x8_t) sum, 1);
129 sumHalf = vmulq(sum, (float16_t)0.5);
130 /*
131 * compute 2 x iterations
132 */
133 INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, (f16x8_t) newtonStartVec);
134 INVSQRT_NEWTON_MVE_F16(invSqrt, sumHalf, invSqrt);
135 /*
136 * set negative values to 0
137 */
138 invSqrt = vdupq_m(invSqrt, (float16_t)0.0, vcmpltq(invSqrt, (float16_t)0.0));
139 /*
140 * sqrt(x) = x * invSqrt(x)
141 */
142 sum = vmulq(sum, invSqrt);
143 vstrhq_p_f16(pDst, sum, p0);
144 }
145 }
146
147 #else
arm_cmplx_mag_f16(const float16_t * pSrc,float16_t * pDst,uint32_t numSamples)148 void arm_cmplx_mag_f16(
149 const float16_t * pSrc,
150 float16_t * pDst,
151 uint32_t numSamples)
152 {
153 uint32_t blkCnt; /* loop counter */
154 _Float16 real, imag; /* Temporary variables to hold input values */
155
156 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
157
158 /* Loop unrolling: Compute 4 outputs at a time */
159 blkCnt = numSamples >> 2U;
160
161 while (blkCnt > 0U)
162 {
163 /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
164
165 real = *pSrc++;
166 imag = *pSrc++;
167
168 /* store result in destination buffer. */
169 arm_sqrt_f16((real * real) + (imag * imag), pDst++);
170
171 real = *pSrc++;
172 imag = *pSrc++;
173 arm_sqrt_f16((real * real) + (imag * imag), pDst++);
174
175 real = *pSrc++;
176 imag = *pSrc++;
177 arm_sqrt_f16((real * real) + (imag * imag), pDst++);
178
179 real = *pSrc++;
180 imag = *pSrc++;
181 arm_sqrt_f16((real * real) + (imag * imag), pDst++);
182
183 /* Decrement loop counter */
184 blkCnt--;
185 }
186
187 /* Loop unrolling: Compute remaining outputs */
188 blkCnt = numSamples % 0x4U;
189
190 #else
191
192 /* Initialize blkCnt with number of samples */
193 blkCnt = numSamples;
194
195 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
196
197 while (blkCnt > 0U)
198 {
199 /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
200
201 real = *pSrc++;
202 imag = *pSrc++;
203
204 /* store result in destination buffer. */
205 arm_sqrt_f16((real * real) + (imag * imag), pDst++);
206
207 /* Decrement loop counter */
208 blkCnt--;
209 }
210
211 }
212 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
213
214 /**
215 @} end of cmplx_mag group
216 */
217
218 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
219