/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mult_cmplx_f32.c
 * Description:  Floating-point complex-by-complex multiplication
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/complex_math_functions.h"

/**
  @ingroup groupCmplxMath
 */

/**
  @defgroup CmplxByCmplxMult Complex-by-Complex Multiplication

  Multiplies a complex vector by another complex vector and generates a complex result.
  The data in the complex arrays is stored in an interleaved fashion
  (real, imag, real, imag, ...).
  The parameter <code>numSamples</code> represents the number of complex
  samples processed. The complex arrays have a total of <code>2*numSamples</code>
  real values.

  The underlying algorithm is:

  <pre>
  for (n = 0; n < numSamples; n++) {
      pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
      pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
  }
  </pre>

  There are separate functions for floating-point, Q15, and Q31 data types.
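
  @par           Example
                 A minimal usage sketch (buffer names and length are
                 illustrative, not part of the library API). For a single
                 sample, (1 + 2i)(3 + 4i) = -5 + 10i:

  <pre>
      #define NUM_SAMPLES  64

      float32_t srcA[2 * NUM_SAMPLES];   // interleaved: re0, im0, re1, im1, ...
      float32_t srcB[2 * NUM_SAMPLES];
      float32_t dst [2 * NUM_SAMPLES];

      // e.g. srcA[0] = 1.0f; srcA[1] = 2.0f;   (1 + 2i)
      //      srcB[0] = 3.0f; srcB[1] = 4.0f;   (3 + 4i)

      arm_cmplx_mult_cmplx_f32(srcA, srcB, dst, NUM_SAMPLES);

      // dst[0] == -5.0f, dst[1] == 10.0f      (-5 + 10i)
  </pre>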
 */

/**
  @addtogroup CmplxByCmplxMult
  @{
 */

/**
  @brief         Floating-point complex-by-complex multiplication.
  @param[in]     pSrcA       points to first input vector
  @param[in]     pSrcB       points to second input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of samples in each vector
 */

#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)

ARM_DSP_ATTRIBUTE void arm_cmplx_mult_cmplx_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
          float32_t * pDst,
          uint32_t numSamples)
{
    int32_t blkCnt;
    f32x4_t vecSrcA, vecSrcB;
    f32x4_t vecSrcC, vecSrcD;
    f32x4_t vec_acc;

    blkCnt = numSamples >> 2;
    blkCnt -= 1;
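    /* The pipelined loop below produces 4 complex outputs (8 floats) per
       iteration; one block is peeled off here (blkCnt -= 1) so the final
       loads and stores can be issued after the loop as an epilogue. */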
    if (blkCnt > 0) {
        /* Software-pipelining prologue: preloading the first vector pair
           should give the compiler more freedom to generate stall-free code. */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 4;
        pSrcB += 4;

        while (blkCnt > 0) {
            vec_acc = vcmulq(vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 4;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 4;
            vst1q(pDst, vec_acc);
            pDst += 4;

            vec_acc = vcmulq(vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 4;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 4;
            vst1q(pDst, vec_acc);
            pDst += 4;
            /*
             * Decrement the loop counter
             */
            blkCnt--;
        }

        /* Process the last two vector pairs outside the loop to keep armclang
           from breaking the software pipeline. Note that the final vld1q loads
           below do not advance pSrcA/pSrcB; the tail loop compensates. */
        vec_acc = vcmulq(vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);
        vst1q(pDst, vec_acc);
        pDst += 4;

        vec_acc = vcmulq(vecSrcC, vecSrcD);
        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
        vst1q(pDst, vec_acc);
        pDst += 4;

        /*
         * Tail: 0..3 complex samples remain, counted here in float elements
         * (CMPLX_DIM == 2 floats per complex sample)
         */
        blkCnt = CMPLX_DIM * (numSamples & 3);
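        /* pSrcA/pSrcB still lag the data consumed above by one vector, since
           the epilogue loads did not advance them; the increments at the top
           of this loop re-align the pointers before the predicated loads. */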
        while (blkCnt > 0) {
            mve_pred16_t p = vctp32q(blkCnt);
            pSrcA += 4;
            pSrcB += 4;

            vecSrcA = vldrwq_z_f32(pSrcA, p);
            vecSrcB = vldrwq_z_f32(pSrcB, p);
            vec_acc = vcmulq_m(vuninitializedq_f32(), vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);

            vstrwq_p_f32(pDst, vec_acc, p);
            pDst += 4;

            blkCnt -= 4;
        }
    } else {
        /* Fewer than 8 complex samples: process the whole vector with
           tail predication */
        blkCnt = numSamples * CMPLX_DIM;
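        /* The next initialization is dead (vec_acc is fully rewritten before
           use in the loop below); it likely just avoids compiler warnings
           about a potentially uninitialized variable. */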
        vec_acc = vdupq_n_f32(0.0f);

        do {
            mve_pred16_t p = vctp32q(blkCnt);

            vecSrcA = vldrwq_z_f32(pSrcA, p);
            vecSrcB = vldrwq_z_f32(pSrcB, p);

            vec_acc = vcmulq_m(vuninitializedq_f32(), vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
            vstrwq_p_f32(pDst, vec_acc, p);
            pDst += 4;

            /*
             * Decrement the blkCnt loop counter
             * Advance vector source and destination pointers
             */
            pSrcA += 4;
            pSrcB += 4;
            blkCnt -= 4;
        }
        while (blkCnt > 0);
    }

}

#else

ARM_DSP_ATTRIBUTE void arm_cmplx_mult_cmplx_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
        float32_t * pDst,
        uint32_t numSamples)
{
  uint32_t blkCnt;                               /* Loop counter */
  float32_t a, b, c, d;                          /* Temporary variables to store real and imaginary values */

#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
  float32x4x2_t va, vb;
  float32x4x2_t outCplx;

  /* Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

  while (blkCnt > 0U)
  {
    va = vld2q_f32(pSrcA);  /* load pSrcA and de-interleave into real/imag parts */
    vb = vld2q_f32(pSrcB);  /* load pSrcB and de-interleave into real/imag parts */

    /* Increment pointers */
    pSrcA += 8;
    pSrcB += 8;

    /* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
    outCplx.val[0] = vmulq_f32(va.val[0], vb.val[0]);
    outCplx.val[0] = vmlsq_f32(outCplx.val[0], va.val[1], vb.val[1]);

    /* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
    outCplx.val[1] = vmulq_f32(va.val[0], vb.val[1]);
    outCplx.val[1] = vmlaq_f32(outCplx.val[1], va.val[1], vb.val[0]);

    vst2q_f32(pDst, outCplx);

    /* Increment pointer */
    pDst += 8;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Tail */
  blkCnt = numSamples & 3;
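  /* Any remaining 1-3 complex samples fall through to the scalar loop below. */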

#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

  while (blkCnt > 0U)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    /* store result in destination buffer. */
    *pDst++ = (a * c) - (b * d);
    *pDst++ = (a * d) + (b * c);

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (a * c) - (b * d);
    *pDst++ = (a * d) + (b * c);

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (a * c) - (b * d);
    *pDst++ = (a * d) + (b * c);

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (a * c) - (b * d);
    *pDst++ = (a * d) + (b * c);

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = numSamples;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */

  while (blkCnt > 0U)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;

    /* store result in destination buffer. */
    *pDst++ = (a * c) - (b * d);
    *pDst++ = (a * d) + (b * c);

    /* Decrement loop counter */
    blkCnt--;
  }

}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

/**
  @} end of CmplxByCmplxMult group
 */