1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cmplx_dot_prod_q31.c
4 * Description: Q31 complex dot product
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/complex_math_functions.h"
30
31 /**
32 @ingroup groupCmplxMath
33 */
34
35 /**
36 @addtogroup cmplx_dot_prod
37 @{
38 */
39
40 /**
41 @brief Q31 complex dot product.
42 @param[in] pSrcA points to the first input vector
43 @param[in] pSrcB points to the second input vector
44 @param[in] numSamples number of samples in each vector
45 @param[out] realResult real part of the result returned here
46 @param[out] imagResult imaginary part of the result returned here
47
48 @par Scaling and Overflow Behavior
49 The function is implemented using an internal 64-bit accumulator.
50 The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
51 The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
52 Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768.
53 The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format.
54 Input down scaling is not required.
55 */
56
57 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
58
arm_cmplx_dot_prod_q31(const q31_t * pSrcA,const q31_t * pSrcB,uint32_t numSamples,q63_t * realResult,q63_t * imagResult)59 ARM_DSP_ATTRIBUTE void arm_cmplx_dot_prod_q31(
60 const q31_t * pSrcA,
61 const q31_t * pSrcB,
62 uint32_t numSamples,
63 q63_t * realResult,
64 q63_t * imagResult)
65 {
66 int32_t blkCnt;
67 q63_t accReal = 0LL;
68 q63_t accImag = 0LL;
69 q31x4_t vecSrcA, vecSrcB;
70 q31x4_t vecSrcC, vecSrcD;
71
72 blkCnt = numSamples >> 2;
73 blkCnt -= 1;
74 if (blkCnt > 0) {
75 /* should give more freedom to generate stall free code */
76 vecSrcA = vld1q(pSrcA);
77 vecSrcB = vld1q(pSrcB);
78 pSrcA += 4;
79 pSrcB += 4;
80
81 while (blkCnt > 0) {
82
83 accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
84 vecSrcC = vld1q(pSrcA);
85 pSrcA += 4;
86
87 accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
88 vecSrcD = vld1q(pSrcB);
89 pSrcB += 4;
90
91 accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
92 vecSrcA = vld1q(pSrcA);
93 pSrcA += 4;
94
95 accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
96 vecSrcB = vld1q(pSrcB);
97 pSrcB += 4;
98 /*
99 * Decrement the blockSize loop counter
100 */
101 blkCnt--;
102 }
103
104 /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
105 accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
106 vecSrcC = vld1q(pSrcA);
107
108 accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
109 vecSrcD = vld1q(pSrcB);
110
111 accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
112 vecSrcA = vld1q(pSrcA);
113
114 accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
115 vecSrcB = vld1q(pSrcB);
116
117 /*
118 * tail
119 */
120 blkCnt = CMPLX_DIM * (numSamples & 3);
121 do {
122 mve_pred16_t p = vctp32q(blkCnt);
123
124 pSrcA += 4;
125 pSrcB += 4;
126
127 vecSrcA = vldrwq_z_s32(pSrcA, p);
128 vecSrcB = vldrwq_z_s32(pSrcB, p);
129
130 accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
131 accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);
132
133 blkCnt -= 4;
134 }
135 while ((int32_t) blkCnt > 0);
136 } else {
137 blkCnt = numSamples * CMPLX_DIM;
138 while (blkCnt > 0) {
139 mve_pred16_t p = vctp32q(blkCnt);
140
141 vecSrcA = vldrwq_z_s32(pSrcA, p);
142 vecSrcB = vldrwq_z_s32(pSrcB, p);
143
144 accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
145 accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);
146
147 /*
148 * Decrement the blkCnt loop counter
149 * Advance vector source and destination pointers
150 */
151 pSrcA += 4;
152 pSrcB += 4;
153 blkCnt -= 4;
154 }
155 }
156 *realResult = asrl(accReal, (14 - 8));
157 *imagResult = asrl(accImag, (14 - 8));
158
159 }
160
161 #else
arm_cmplx_dot_prod_q31(const q31_t * pSrcA,const q31_t * pSrcB,uint32_t numSamples,q63_t * realResult,q63_t * imagResult)162 ARM_DSP_ATTRIBUTE void arm_cmplx_dot_prod_q31(
163 const q31_t * pSrcA,
164 const q31_t * pSrcB,
165 uint32_t numSamples,
166 q63_t * realResult,
167 q63_t * imagResult)
168 {
169 uint32_t blkCnt; /* Loop counter */
170 q63_t real_sum = 0, imag_sum = 0; /* Temporary result variables */
171 q31_t a0,b0,c0,d0;
172
173 #if defined (ARM_MATH_LOOPUNROLL)
174
175 /* Loop unrolling: Compute 4 outputs at a time */
176 blkCnt = numSamples >> 2U;
177
178 while (blkCnt > 0U)
179 {
180 a0 = *pSrcA++;
181 b0 = *pSrcA++;
182 c0 = *pSrcB++;
183 d0 = *pSrcB++;
184
185 real_sum += ((q63_t)a0 * c0) >> 14;
186 imag_sum += ((q63_t)a0 * d0) >> 14;
187 real_sum -= ((q63_t)b0 * d0) >> 14;
188 imag_sum += ((q63_t)b0 * c0) >> 14;
189
190 a0 = *pSrcA++;
191 b0 = *pSrcA++;
192 c0 = *pSrcB++;
193 d0 = *pSrcB++;
194
195 real_sum += ((q63_t)a0 * c0) >> 14;
196 imag_sum += ((q63_t)a0 * d0) >> 14;
197 real_sum -= ((q63_t)b0 * d0) >> 14;
198 imag_sum += ((q63_t)b0 * c0) >> 14;
199
200 a0 = *pSrcA++;
201 b0 = *pSrcA++;
202 c0 = *pSrcB++;
203 d0 = *pSrcB++;
204
205 real_sum += ((q63_t)a0 * c0) >> 14;
206 imag_sum += ((q63_t)a0 * d0) >> 14;
207 real_sum -= ((q63_t)b0 * d0) >> 14;
208 imag_sum += ((q63_t)b0 * c0) >> 14;
209
210 a0 = *pSrcA++;
211 b0 = *pSrcA++;
212 c0 = *pSrcB++;
213 d0 = *pSrcB++;
214
215 real_sum += ((q63_t)a0 * c0) >> 14;
216 imag_sum += ((q63_t)a0 * d0) >> 14;
217 real_sum -= ((q63_t)b0 * d0) >> 14;
218 imag_sum += ((q63_t)b0 * c0) >> 14;
219
220 /* Decrement loop counter */
221 blkCnt--;
222 }
223
224 /* Loop unrolling: Compute remaining outputs */
225 blkCnt = numSamples % 0x4U;
226
227 #else
228
229 /* Initialize blkCnt with number of samples */
230 blkCnt = numSamples;
231
232 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
233
234 while (blkCnt > 0U)
235 {
236 a0 = *pSrcA++;
237 b0 = *pSrcA++;
238 c0 = *pSrcB++;
239 d0 = *pSrcB++;
240
241 real_sum += ((q63_t)a0 * c0) >> 14;
242 imag_sum += ((q63_t)a0 * d0) >> 14;
243 real_sum -= ((q63_t)b0 * d0) >> 14;
244 imag_sum += ((q63_t)b0 * c0) >> 14;
245
246 /* Decrement loop counter */
247 blkCnt--;
248 }
249
250 /* Store real and imaginary result in 16.48 format */
251 *realResult = real_sum;
252 *imagResult = imag_sum;
253 }
254 #endif /* defined(ARM_MATH_MVEI) */
255
256 /**
257 @} end of cmplx_dot_prod group
258 */
259