/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_dot_prod_f32.c
 * Description:  Floating-point complex dot product
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/complex_math_functions.h"

/**
  @ingroup groupCmplxMath
 */

/**
  @defgroup cmplx_dot_prod Complex Dot Product

  Computes the dot product of two complex vectors.
  The vectors are multiplied element-by-element and then summed.

  The <code>pSrcA</code> points to the first complex input vector and
  <code>pSrcB</code> points to the second complex input vector.
  <code>numSamples</code> specifies the number of complex samples
  and the data in each array is stored in an interleaved fashion
  (real, imag, real, imag, ...).
  Each array has a total of <code>2*numSamples</code> values.

  The underlying algorithm is used:

  <pre>
  realResult = 0;
  imagResult = 0;
  for (n = 0; n < numSamples; n++) {
      realResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
      imagResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
  }
  </pre>

  There are separate functions for floating-point, Q15, and Q31 data types.
 */

/**
  @addtogroup cmplx_dot_prod
  @{
 */

/**
  @brief         Floating-point complex dot product.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     numSamples  number of samples in each vector
  @param[out]    realResult  real part of the result returned here
  @param[out]    imagResult  imaginary part of the result returned here
 */

76 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
77
arm_cmplx_dot_prod_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t numSamples,float32_t * realResult,float32_t * imagResult)78 ARM_DSP_ATTRIBUTE void arm_cmplx_dot_prod_f32(
79 const float32_t * pSrcA,
80 const float32_t * pSrcB,
81 uint32_t numSamples,
82 float32_t * realResult,
83 float32_t * imagResult)
84 {
85 int32_t blkCnt;
86 float32_t real_sum, imag_sum;
87 f32x4_t vecSrcA, vecSrcB;
88 f32x4_t vec_acc = vdupq_n_f32(0.0f);
89 f32x4_t vecSrcC, vecSrcD;
90
91 blkCnt = numSamples >> 2;
92 blkCnt -= 1;
93 if (blkCnt > 0) {
94 /* should give more freedom to generate stall free code */
95 vecSrcA = vld1q(pSrcA);
96 vecSrcB = vld1q(pSrcB);
97 pSrcA += 4;
98 pSrcB += 4;
99
100 while (blkCnt > 0) {
101 vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
102 vecSrcC = vld1q(pSrcA);
103 pSrcA += 4;
104
105 vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
106 vecSrcD = vld1q(pSrcB);
107 pSrcB += 4;
108
109 vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
110 vecSrcA = vld1q(pSrcA);
111 pSrcA += 4;
112
113 vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
114 vecSrcB = vld1q(pSrcB);
115 pSrcB += 4;
116 /*
117 * Decrement the blockSize loop counter
118 */
119 blkCnt--;
120 }
121
122 /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
123 vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
124 vecSrcC = vld1q(pSrcA);
125
126 vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
127 vecSrcD = vld1q(pSrcB);
128
129 vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
130 vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
131
132 /*
133 * tail
134 */
135 blkCnt = CMPLX_DIM * (numSamples & 3);
136 while (blkCnt > 0) {
137 mve_pred16_t p = vctp32q(blkCnt);
138 pSrcA += 4;
139 pSrcB += 4;
140 vecSrcA = vldrwq_z_f32(pSrcA, p);
141 vecSrcB = vldrwq_z_f32(pSrcB, p);
142 vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
143 vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
144 blkCnt -= 4;
145 }
146 } else {
147 /* small vector */
148 blkCnt = numSamples * CMPLX_DIM;
149 vec_acc = vdupq_n_f32(0.0f);
150
151 do {
152 mve_pred16_t p = vctp32q(blkCnt);
153
154 vecSrcA = vldrwq_z_f32(pSrcA, p);
155 vecSrcB = vldrwq_z_f32(pSrcB, p);
156
157 vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
158 vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
159
160 /*
161 * Decrement the blkCnt loop counter
162 * Advance vector source and destination pointers
163 */
164 pSrcA += 4;
165 pSrcB += 4;
166 blkCnt -= 4;
167 }
168 while (blkCnt > 0);
169 }
170
171 real_sum = vgetq_lane(vec_acc, 0) + vgetq_lane(vec_acc, 2);
172 imag_sum = vgetq_lane(vec_acc, 1) + vgetq_lane(vec_acc, 3);
173
174 /*
175 * Store the real and imaginary results in the destination buffers
176 */
177 *realResult = real_sum;
178 *imagResult = imag_sum;
179 }
180
181 #else
arm_cmplx_dot_prod_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t numSamples,float32_t * realResult,float32_t * imagResult)182 ARM_DSP_ATTRIBUTE void arm_cmplx_dot_prod_f32(
183 const float32_t * pSrcA,
184 const float32_t * pSrcB,
185 uint32_t numSamples,
186 float32_t * realResult,
187 float32_t * imagResult)
188 {
189 uint32_t blkCnt; /* Loop counter */
190 float32_t real_sum = 0.0f, imag_sum = 0.0f; /* Temporary result variables */
191 float32_t a0,b0,c0,d0;
192
193 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
194 float32x4x2_t vec1,vec2,vec3,vec4;
195 float32x4_t accR,accI;
196 float32x2_t accum = vdup_n_f32(0);
197
198 accR = vdupq_n_f32(0.0f);
199 accI = vdupq_n_f32(0.0f);
200
201 /* Loop unrolling: Compute 8 outputs at a time */
202 blkCnt = numSamples >> 3U;
203
204 while (blkCnt > 0U)
205 {
206 /* C = (A[0]+jA[1])*(B[0]+jB[1]) + ... */
207 /* Calculate dot product and then store the result in a temporary buffer. */
208
209 vec1 = vld2q_f32(pSrcA);
210 vec2 = vld2q_f32(pSrcB);
211
212 /* Increment pointers */
213 pSrcA += 8;
214 pSrcB += 8;
215
216 /* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
217 accR = vmlaq_f32(accR,vec1.val[0],vec2.val[0]);
218 accR = vmlsq_f32(accR,vec1.val[1],vec2.val[1]);
219
220 /* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
221 accI = vmlaq_f32(accI,vec1.val[1],vec2.val[0]);
222 accI = vmlaq_f32(accI,vec1.val[0],vec2.val[1]);
223
224 vec3 = vld2q_f32(pSrcA);
225 vec4 = vld2q_f32(pSrcB);
226
227 /* Increment pointers */
228 pSrcA += 8;
229 pSrcB += 8;
230
231 /* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
232 accR = vmlaq_f32(accR,vec3.val[0],vec4.val[0]);
233 accR = vmlsq_f32(accR,vec3.val[1],vec4.val[1]);
234
235 /* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
236 accI = vmlaq_f32(accI,vec3.val[1],vec4.val[0]);
237 accI = vmlaq_f32(accI,vec3.val[0],vec4.val[1]);
238
239 /* Decrement the loop counter */
240 blkCnt--;
241 }
242
243 accum = vpadd_f32(vget_low_f32(accR), vget_high_f32(accR));
244 real_sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
245
246 accum = vpadd_f32(vget_low_f32(accI), vget_high_f32(accI));
247 imag_sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
248
249 /* Tail */
250 blkCnt = numSamples & 0x7;
251
252 #else
253 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
254
255 /* Loop unrolling: Compute 4 outputs at a time */
256 blkCnt = numSamples >> 2U;
257
258 while (blkCnt > 0U)
259 {
260 a0 = *pSrcA++;
261 b0 = *pSrcA++;
262 c0 = *pSrcB++;
263 d0 = *pSrcB++;
264
265 real_sum += a0 * c0;
266 imag_sum += a0 * d0;
267 real_sum -= b0 * d0;
268 imag_sum += b0 * c0;
269
270 a0 = *pSrcA++;
271 b0 = *pSrcA++;
272 c0 = *pSrcB++;
273 d0 = *pSrcB++;
274
275 real_sum += a0 * c0;
276 imag_sum += a0 * d0;
277 real_sum -= b0 * d0;
278 imag_sum += b0 * c0;
279
280 a0 = *pSrcA++;
281 b0 = *pSrcA++;
282 c0 = *pSrcB++;
283 d0 = *pSrcB++;
284
285 real_sum += a0 * c0;
286 imag_sum += a0 * d0;
287 real_sum -= b0 * d0;
288 imag_sum += b0 * c0;
289
290 a0 = *pSrcA++;
291 b0 = *pSrcA++;
292 c0 = *pSrcB++;
293 d0 = *pSrcB++;
294
295 real_sum += a0 * c0;
296 imag_sum += a0 * d0;
297 real_sum -= b0 * d0;
298 imag_sum += b0 * c0;
299
300 /* Decrement loop counter */
301 blkCnt--;
302 }
303
304 /* Loop unrolling: Compute remaining outputs */
305 blkCnt = numSamples % 0x4U;
306
307 #else
308
309 /* Initialize blkCnt with number of samples */
310 blkCnt = numSamples;
311
312 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
313 #endif /* #if defined(ARM_MATH_NEON) */
314
315 while (blkCnt > 0U)
316 {
317 a0 = *pSrcA++;
318 b0 = *pSrcA++;
319 c0 = *pSrcB++;
320 d0 = *pSrcB++;
321
322 real_sum += a0 * c0;
323 imag_sum += a0 * d0;
324 real_sum -= b0 * d0;
325 imag_sum += b0 * c0;
326
327 /* Decrement loop counter */
328 blkCnt--;
329 }
330
331 /* Store real and imaginary result in destination buffer. */
332 *realResult = real_sum;
333 *imagResult = imag_sum;
334 }
335 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

/**
  @} end of cmplx_dot_prod group
 */
