1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cmplx_dot_prod_q15.c
4 * Description: Processing function for the Q15 Complex Dot product
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/complex_math_functions.h"
30
31 /**
32 @ingroup groupCmplxMath
33 */
34
35 /**
36 @addtogroup cmplx_dot_prod
37 @{
38 */
39
40 /**
41 @brief Q15 complex dot product.
42 @param[in] pSrcA points to the first input vector
43 @param[in] pSrcB points to the second input vector
44 @param[in] numSamples number of samples in each vector
45 @param[out] realResult real part of the result returned here
46 @param[out] imagResult imaginary part of the result returned her
47
48 @par Scaling and Overflow Behavior
49 The function is implemented using an internal 64-bit accumulator.
50 The intermediate 1.15 by 1.15 multiplications are performed with full precision and yield a 2.30 result.
51 These are accumulated in a 64-bit accumulator with 34.30 precision.
52 As a final step, the accumulators are converted to 8.24 format.
53 The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
54 */
55
56 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_cmplx_dot_prod_q15(const q15_t * pSrcA,const q15_t * pSrcB,uint32_t numSamples,q31_t * realResult,q31_t * imagResult)57 ARM_DSP_ATTRIBUTE void arm_cmplx_dot_prod_q15(
58 const q15_t * pSrcA,
59 const q15_t * pSrcB,
60 uint32_t numSamples,
61 q31_t * realResult,
62 q31_t * imagResult)
63 {
64 int32_t blkCnt;
65 q63_t accReal = 0LL;
66 q63_t accImag = 0LL;
67 q15x8_t vecSrcA, vecSrcB;
68 q15x8_t vecSrcC, vecSrcD;
69
70 blkCnt = (numSamples >> 3);
71 blkCnt -= 1;
72 if (blkCnt > 0) {
73 /* should give more freedom to generate stall free code */
74 vecSrcA = vld1q(pSrcA);
75 vecSrcB = vld1q(pSrcB);
76 pSrcA += 8;
77 pSrcB += 8;
78
79 while (blkCnt > 0) {
80
81 accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
82 vecSrcC = vld1q(pSrcA);
83 pSrcA += 8;
84
85 accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
86 vecSrcD = vld1q(pSrcB);
87 pSrcB += 8;
88
89 accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD);
90 vecSrcA = vld1q(pSrcA);
91 pSrcA += 8;
92
93 accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD);
94 vecSrcB = vld1q(pSrcB);
95 pSrcB += 8;
96 /*
97 * Decrement the blockSize loop counter
98 */
99 blkCnt--;
100 }
101
102 /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
103 accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
104 vecSrcC = vld1q(pSrcA);
105
106 accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
107 vecSrcD = vld1q(pSrcB);
108
109 accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD);
110 vecSrcA = vld1q(pSrcA);
111
112 accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD);
113 vecSrcB = vld1q(pSrcB);
114
115 /*
116 * tail
117 */
118 blkCnt = CMPLX_DIM * (numSamples & 7);
119 do {
120 mve_pred16_t p = vctp16q(blkCnt);
121
122 pSrcA += 8;
123 pSrcB += 8;
124
125 vecSrcA = vldrhq_z_s16(pSrcA, p);
126 vecSrcB = vldrhq_z_s16(pSrcB, p);
127
128 accReal = vmlsldavaq_p(accReal, vecSrcA, vecSrcB, p);
129 accImag = vmlaldavaxq_p(accImag, vecSrcA, vecSrcB, p);
130
131 blkCnt -= 8;
132 }
133 while ((int32_t) blkCnt > 0);
134 } else {
135 blkCnt = numSamples * CMPLX_DIM;
136 while (blkCnt > 0) {
137 mve_pred16_t p = vctp16q(blkCnt);
138
139 vecSrcA = vldrhq_z_s16(pSrcA, p);
140 vecSrcB = vldrhq_z_s16(pSrcB, p);
141
142 accReal = vmlsldavaq_p(accReal, vecSrcA, vecSrcB, p);
143 accImag = vmlaldavaxq_p(accImag, vecSrcA, vecSrcB, p);
144
145 /*
146 * Decrement the blkCnt loop counter
147 * Advance vector source and destination pointers
148 */
149 pSrcA += 8;
150 pSrcB += 8;
151 blkCnt -= 8;
152 }
153 }
154 *realResult = asrl(accReal, (14 - 8));
155 *imagResult = asrl(accImag, (14 - 8));
156 }
157 #else
arm_cmplx_dot_prod_q15(const q15_t * pSrcA,const q15_t * pSrcB,uint32_t numSamples,q31_t * realResult,q31_t * imagResult)158 ARM_DSP_ATTRIBUTE void arm_cmplx_dot_prod_q15(
159 const q15_t * pSrcA,
160 const q15_t * pSrcB,
161 uint32_t numSamples,
162 q31_t * realResult,
163 q31_t * imagResult)
164 {
165 uint32_t blkCnt; /* Loop counter */
166 q63_t real_sum = 0, imag_sum = 0; /* Temporary result variables */
167 q15_t a0,b0,c0,d0;
168
169 #if defined (ARM_MATH_LOOPUNROLL)
170 /* Loop unrolling: Compute 4 outputs at a time */
171 blkCnt = numSamples >> 2U;
172
173 while (blkCnt > 0U)
174 {
175 a0 = *pSrcA++;
176 b0 = *pSrcA++;
177 c0 = *pSrcB++;
178 d0 = *pSrcB++;
179
180 real_sum += (q31_t)a0 * c0;
181 imag_sum += (q31_t)a0 * d0;
182 real_sum -= (q31_t)b0 * d0;
183 imag_sum += (q31_t)b0 * c0;
184
185 a0 = *pSrcA++;
186 b0 = *pSrcA++;
187 c0 = *pSrcB++;
188 d0 = *pSrcB++;
189
190 real_sum += (q31_t)a0 * c0;
191 imag_sum += (q31_t)a0 * d0;
192 real_sum -= (q31_t)b0 * d0;
193 imag_sum += (q31_t)b0 * c0;
194
195 a0 = *pSrcA++;
196 b0 = *pSrcA++;
197 c0 = *pSrcB++;
198 d0 = *pSrcB++;
199
200 real_sum += (q31_t)a0 * c0;
201 imag_sum += (q31_t)a0 * d0;
202 real_sum -= (q31_t)b0 * d0;
203 imag_sum += (q31_t)b0 * c0;
204
205 a0 = *pSrcA++;
206 b0 = *pSrcA++;
207 c0 = *pSrcB++;
208 d0 = *pSrcB++;
209
210 real_sum += (q31_t)a0 * c0;
211 imag_sum += (q31_t)a0 * d0;
212 real_sum -= (q31_t)b0 * d0;
213 imag_sum += (q31_t)b0 * c0;
214
215 /* Decrement loop counter */
216 blkCnt--;
217 }
218
219 /* Loop unrolling: Compute remaining outputs */
220 blkCnt = numSamples % 0x4U;
221
222 #else
223
224 /* Initialize blkCnt with number of samples */
225 blkCnt = numSamples;
226
227 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
228
229 while (blkCnt > 0U)
230 {
231 a0 = *pSrcA++;
232 b0 = *pSrcA++;
233 c0 = *pSrcB++;
234 d0 = *pSrcB++;
235
236 real_sum += (q31_t)a0 * c0;
237 imag_sum += (q31_t)a0 * d0;
238 real_sum -= (q31_t)b0 * d0;
239 imag_sum += (q31_t)b0 * c0;
240
241 /* Decrement loop counter */
242 blkCnt--;
243 }
244
245 /* Store real and imaginary result in 8.24 format */
246 /* Convert real data in 34.30 to 8.24 by 6 right shifts */
247 *realResult = (q31_t) (real_sum >> 6);
248 /* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
249 *imagResult = (q31_t) (imag_sum >> 6);
250 }
251 #endif /* defined(ARM_MATH_MVEI) */
252
253 /**
254 @} end of cmplx_dot_prod group
255 */
256