/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mult_cmplx_q31.c
 * Description:  Q31 complex-by-complex multiplication
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/complex_math_functions.h"

/**
  @ingroup groupCmplxMath
 */

/**
  @addtogroup CmplxByCmplxMult
  @{
 */

/**
  @brief         Q31 complex-by-complex multiplication.
  @param[in]     pSrcA       points to first input vector
  @param[in]     pSrcB       points to second input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of samples in each vector

  @par           Scaling and Overflow Behavior
                   The function implements 1.31 by 1.31 multiplications and the result is converted to 3.29 format.
                   Input down scaling is not required.
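
  @par           Example
                   A minimal usage sketch; the buffer names and the vector length of 32 complex
                   samples used here are illustrative only.
  @code
      #define NUM_SAMPLES 32

      q31_t srcA[2 * NUM_SAMPLES];   // interleaved real/imaginary parts in 1.31 format
      q31_t srcB[2 * NUM_SAMPLES];
      q31_t dst [2 * NUM_SAMPLES];   // results are written in 3.29 format

      arm_cmplx_mult_cmplx_q31(srcA, srcB, dst, NUM_SAMPLES);
  @endcode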
 */

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_mult_cmplx_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        q31_t * pDst,
        uint32_t numSamples)
{
    int32_t blkCnt;
    q31x4_t vecSrcA, vecSrcB;
    q31x4_t vecSrcC, vecSrcD;
    q31x4_t vecDst;

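    /* Each main-loop iteration consumes two q31x4_t vectors per source, i.e. four complex
       samples; one group of four is peeled off the count and handled after the loop, and any
       remaining 1-3 samples go through the predicated tail. */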
    blkCnt = numSamples >> 2;
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* Preloading the first vectors should give the compiler more freedom to generate stall-free code */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 4;
        pSrcB += 4;

        while (blkCnt > 0) {

            /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
            vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 4;

            /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
            vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 4;

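            /* The doubling dual-multiply intrinsics produce (effectively) 1.31 results; the
               arithmetic shift right by 2 converts them to the 3.29 output format used by the
               scalar implementation. */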
            vst1q(pDst, vshrq(vecDst, 2));
            pDst += 4;

            vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 4;

            vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 4;

            vst1q(pDst, vshrq(vecDst, 2));
            pDst += 4;

            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* Process the last elements outside the loop to avoid armclang breaking the software pipeline */
        vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);

        vst1q(pDst, vshrq(vecDst, 2));
        pDst += 4;

        vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcC, vecSrcD);
        vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);

        vst1q(pDst, vshrq(vecDst, 2));
        pDst += 4;

        /*
         * tail
         */
        blkCnt = CMPLX_DIM * (numSamples & 3);
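        /* blkCnt now counts the remaining q31 values (interleaved real/imaginary parts of the
           1-3 leftover complex samples); vctp32q builds a lane predicate so the final partial
           vectors are loaded, computed and stored without accessing memory past the buffers. */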
        do {
            mve_pred16_t p = vctp32q(blkCnt);

            pSrcA += 4;
            pSrcB += 4;

            vecSrcA = vldrwq_z_s32(pSrcA, p);
            vecSrcB = vldrwq_z_s32(pSrcB, p);

            vecDst = vqdmlsdhq_m(vuninitializedq_s32(), vecSrcA, vecSrcB, p);
            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);

            vecDst = vshrq_m(vuninitializedq_s32(), vecDst, 2, p);
            vstrwq_p_s32(pDst, vecDst, p);
            pDst += 4;

            blkCnt -= 4;
        }
        while ((int32_t) blkCnt > 0);
    } else {
        blkCnt = numSamples * CMPLX_DIM;
        while (blkCnt > 0) {
            mve_pred16_t p = vctp32q(blkCnt);

            vecSrcA = vldrwq_z_s32(pSrcA, p);
            vecSrcB = vldrwq_z_s32(pSrcB, p);

            vecDst = vqdmlsdhq_m(vuninitializedq_s32(), vecSrcA, vecSrcB, p);
            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);

            vecDst = vshrq_m(vuninitializedq_s32(), vecDst, 2, p);
            vstrwq_p_s32(pDst, vecDst, p);

            pDst += 4;
            pSrcA += 4;
            pSrcB += 4;

            blkCnt -= 4;
        }
    }
}
#else
void arm_cmplx_mult_cmplx_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        q31_t * pDst,
        uint32_t numSamples)
{
  uint32_t blkCnt;                               /* Loop counter */
  q31_t a, b, c, d;                              /* Temporary variables */

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

  while (blkCnt > 0U)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
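    /* Each 1.31 x 1.31 product is a 2.62 value held in 64 bits; shifting it right by 33 leaves
       29 fractional bits, so the sum or difference of two such terms fits in the 3.29 output
       format without overflow. */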
    /* store result in 3.29 format in destination buffer. */
    *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
    *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
    *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
    *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
    *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = numSamples;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  while (blkCnt > 0U)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;

    /* store result in 3.29 format in destination buffer. */
    *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
    *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );

    /* Decrement loop counter */
    blkCnt--;
  }

}
#endif /* defined(ARM_MATH_MVEI) */

/**
  @} end of CmplxByCmplxMult group
 */