/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_dot_prod_q31.c
 * Description:  Q31 complex dot product
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/complex_math_functions.h"

/**
  @ingroup groupCmplxMath
 */

/**
  @addtogroup cmplx_dot_prod
  @{
 */

/**
  @brief         Q31 complex dot product.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     numSamples  number of samples in each vector
  @param[out]    realResult  real part of the result returned here
  @param[out]    imagResult  imaginary part of the result returned here
  @return        none

  @par           Scaling and Overflow Behavior
                   The function is implemented using an internal 64-bit accumulator.
                   The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
                   The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
                   Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768.
                   The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format.
                   Input down scaling is not required.
 */
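
/*
 * Usage sketch (illustrative only, not part of the library): the buffer length,
 * contents and the helper function below are assumptions made for the example,
 * not something defined by this file.
 *
 *   #include "dsp/complex_math_functions.h"
 *
 *   #define NUM_SAMPLES 64
 *
 *   static q31_t srcA[2 * NUM_SAMPLES];   // interleaved: { re[0], im[0], re[1], im[1], ... }
 *   static q31_t srcB[2 * NUM_SAMPLES];
 *
 *   void test_cmplx_dot_prod(void)
 *   {
 *       q63_t realPart, imagPart;
 *
 *       // ... fill srcA / srcB with Q1.31 data ...
 *
 *       arm_cmplx_dot_prod_q31(srcA, srcB, NUM_SAMPLES, &realPart, &imagPart);
 *
 *       // realPart / imagPart now hold the complex dot product in 16.48 format.
 *   }
 */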

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

void arm_cmplx_dot_prod_q31(
    const q31_t * pSrcA,
    const q31_t * pSrcB,
    uint32_t numSamples,
    q63_t * realResult,
    q63_t * imagResult)
{
    int32_t blkCnt;
    q63_t accReal = 0LL;
    q63_t accImag = 0LL;
    q31x4_t vecSrcA, vecSrcB;
    q31x4_t vecSrcC, vecSrcD;
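
    /* With interleaved complex data {re, im, re, im, ...}:
       - vrmlsldavhaq  accumulates re*re' - im*im' (real part of A[n] * B[n])
       - vrmlaldavhaxq (exchange variant) accumulates re*im' + im*re' (imaginary part)
       Both keep only the high 64 bits of the running sum, i.e. an implicit
       rounded right shift by 8 bits; the final asrl below accounts for this. */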

    blkCnt = numSamples >> 2;
    blkCnt -= 1;
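    /* One group of 4 complex samples is handled outside the loop by the
       preload / epilogue below, hence the -1 */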
    if (blkCnt > 0) {
        /* Preloading should give the compiler more freedom to generate stall-free code */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 4;
        pSrcB += 4;

        while (blkCnt > 0) {

            accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 4;

            accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 4;

            accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 4;

            accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 4;
            /*
             * Decrement the loop counter
             */
            blkCnt--;
        }

        /* Process the last elements outside the loop to avoid armclang breaking the SW pipeline */
        accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);

        accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
        vecSrcA = vld1q(pSrcA);

        accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
        vecSrcB = vld1q(pSrcB);

        /*
         * Tail: process the remaining (numSamples & 3) complex samples with tail predication
         */
        blkCnt = CMPLX_DIM * (numSamples & 3);
        do {
            mve_pred16_t p = vctp32q(blkCnt);

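            /* Advance before loading: on the first pass this skips the group
               already consumed as vecSrcC/vecSrcD in the epilogue above */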
            pSrcA += 4;
            pSrcB += 4;

            vecSrcA = vldrwq_z_s32(pSrcA, p);
            vecSrcB = vldrwq_z_s32(pSrcB, p);

            accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
            accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);

            blkCnt -= 4;
        }
        while ((int32_t) blkCnt > 0);
    } else {
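        /* Short vectors (numSamples < 8): process everything in a single tail-predicated loop */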
        blkCnt = numSamples * CMPLX_DIM;
        while (blkCnt > 0) {
            mve_pred16_t p = vctp32q(blkCnt);

            vecSrcA = vldrwq_z_s32(pSrcA, p);
            vecSrcB = vldrwq_z_s32(pSrcB, p);

            accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
            accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);

            /*
             * Advance the vector source pointers
             * and decrement the loop counter
             */
            pSrcA += 4;
            pSrcB += 4;
            blkCnt -= 4;
        }
    }
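
    /* The vrml*ldavh* accumulations above already discard the low 8 bits of each
       sum, so a further arithmetic shift of 6 gives the same total scaling of 14
       as the scalar path, i.e. results in 16.48 format. */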
    *realResult = asrl(accReal, (14 - 8));
    *imagResult = asrl(accImag, (14 - 8));

}

#else
void arm_cmplx_dot_prod_q31(
    const q31_t * pSrcA,
    const q31_t * pSrcB,
    uint32_t numSamples,
    q63_t * realResult,
    q63_t * imagResult)
{
    uint32_t blkCnt;                   /* Loop counter */
    q63_t real_sum = 0, imag_sum = 0;  /* Temporary result variables */
    q31_t a0, b0, c0, d0;
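
    /* For each complex sample pair A[n] = a + j*b and B[n] = c + j*d:
         real_sum += a*c - b*d
         imag_sum += a*d + b*c
       Each 2.62 product is shifted right by 14 bits before accumulation, so the
       64-bit accumulators hold 16.48 values with 15 guard bits. */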

#if defined (ARM_MATH_LOOPUNROLL)

    /* Loop unrolling: process 4 complex samples per iteration */
    blkCnt = numSamples >> 2U;

    while (blkCnt > 0U)
    {
        a0 = *pSrcA++;
        b0 = *pSrcA++;
        c0 = *pSrcB++;
        d0 = *pSrcB++;

        real_sum += ((q63_t)a0 * c0) >> 14;
        imag_sum += ((q63_t)a0 * d0) >> 14;
        real_sum -= ((q63_t)b0 * d0) >> 14;
        imag_sum += ((q63_t)b0 * c0) >> 14;

        a0 = *pSrcA++;
        b0 = *pSrcA++;
        c0 = *pSrcB++;
        d0 = *pSrcB++;

        real_sum += ((q63_t)a0 * c0) >> 14;
        imag_sum += ((q63_t)a0 * d0) >> 14;
        real_sum -= ((q63_t)b0 * d0) >> 14;
        imag_sum += ((q63_t)b0 * c0) >> 14;

        a0 = *pSrcA++;
        b0 = *pSrcA++;
        c0 = *pSrcB++;
        d0 = *pSrcB++;

        real_sum += ((q63_t)a0 * c0) >> 14;
        imag_sum += ((q63_t)a0 * d0) >> 14;
        real_sum -= ((q63_t)b0 * d0) >> 14;
        imag_sum += ((q63_t)b0 * c0) >> 14;

        a0 = *pSrcA++;
        b0 = *pSrcA++;
        c0 = *pSrcB++;
        d0 = *pSrcB++;

        real_sum += ((q63_t)a0 * c0) >> 14;
        imag_sum += ((q63_t)a0 * d0) >> 14;
        real_sum -= ((q63_t)b0 * d0) >> 14;
        imag_sum += ((q63_t)b0 * c0) >> 14;

        /* Decrement loop counter */
        blkCnt--;
    }

    /* Loop unrolling: process the remaining complex samples */
    blkCnt = numSamples % 0x4U;

#else

    /* Initialize blkCnt with number of samples */
    blkCnt = numSamples;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    while (blkCnt > 0U)
    {
        a0 = *pSrcA++;
        b0 = *pSrcA++;
        c0 = *pSrcB++;
        d0 = *pSrcB++;

        real_sum += ((q63_t)a0 * c0) >> 14;
        imag_sum += ((q63_t)a0 * d0) >> 14;
        real_sum -= ((q63_t)b0 * d0) >> 14;
        imag_sum += ((q63_t)b0 * c0) >> 14;

        /* Decrement loop counter */
        blkCnt--;
    }

    /* Store real and imaginary result in 16.48 format */
    *realResult = real_sum;
    *imagResult = imag_sum;
}

#endif /* defined(ARM_MATH_MVEI) */

/**
  @} end of cmplx_dot_prod group
 */