/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mult_cmplx_q15.c
 * Description:  Q15 complex-by-complex multiplication
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
28
29 #include "dsp/complex_math_functions.h"
30
/**
  @ingroup groupCmplxMath
 */
34
/**
  @addtogroup CmplxByCmplxMult
  @{
 */
39
/**
  @brief         Q15 complex-by-complex multiplication.
  @param[in]     pSrcA       points to first input vector
  @param[in]     pSrcB       points to second input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of samples in each vector

  @par           Scaling and Overflow Behavior
                   The function implements 1.15 by 1.15 multiplications, and the final output is converted into 3.13 format.
 */
50
51 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
52
arm_cmplx_mult_cmplx_q15(const q15_t * pSrcA,const q15_t * pSrcB,q15_t * pDst,uint32_t numSamples)53 ARM_DSP_ATTRIBUTE void arm_cmplx_mult_cmplx_q15(
54 const q15_t * pSrcA,
55 const q15_t * pSrcB,
56 q15_t * pDst,
57 uint32_t numSamples)
58 {
59 int32_t blkCnt;
60 q15x8_t vecSrcA, vecSrcB;
61 q15x8_t vecSrcC, vecSrcD;
62 q15x8_t vecDst;
63
64 blkCnt = (numSamples >> 3);
65 blkCnt -= 1;
66 if (blkCnt > 0)
67 {
68 /* should give more freedom to generate stall free code */
69 vecSrcA = vld1q(pSrcA);
70 vecSrcB = vld1q(pSrcB);
71 pSrcA += 8;
72 pSrcB += 8;
73
74 while (blkCnt > 0)
75 {
76
77 /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
78 vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcA, vecSrcB);
79 vecSrcC = vld1q(pSrcA);
80 pSrcA += 8;
81
82 /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
83 vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
84 vecSrcD = vld1q(pSrcB);
85 pSrcB += 8;
86
87 vstrhq_s16(pDst, vshrq(vecDst, 2));
88 pDst += 8;
89
90 vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcC, vecSrcD);
91 vecSrcA = vld1q(pSrcA);
92 pSrcA += 8;
93
94 vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
95 vecSrcB = vld1q(pSrcB);
96 pSrcB += 8;
97
98 vstrhq_s16(pDst, vshrq(vecDst, 2));
99 pDst += 8;
100
101 /*
102 * Decrement the blockSize loop counter
103 */
104 blkCnt--;
105 }
106
107 /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
108 vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcA, vecSrcB);
109 vecSrcC = vld1q(pSrcA);
110
111 vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
112 vecSrcD = vld1q(pSrcB);
113
114 vstrhq_s16(pDst, vshrq(vecDst, 2));
115 pDst += 8;
116
117 vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcC, vecSrcD);
118 vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
119
120 vstrhq_s16(pDst, vshrq(vecDst, 2));
121 pDst += 8;
122
123 /*
124 * tail
125 */
126 blkCnt = CMPLX_DIM * (numSamples & 7);
127 do
128 {
129 mve_pred16_t p = vctp16q(blkCnt);
130
131 pSrcA += 8;
132 pSrcB += 8;
133
134 vecSrcA = vldrhq_z_s16(pSrcA, p);
135 vecSrcB = vldrhq_z_s16(pSrcB, p);
136
137 vecDst = vqdmlsdhq_m(vuninitializedq_s16(), vecSrcA, vecSrcB, p);
138 vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
139
140 vecDst = vshrq_m(vuninitializedq_s16(), vecDst, 2, p);
141 vstrhq_p_s16(pDst, vecDst, p);
142 pDst += 8;
143
144 blkCnt -= 8;
145 }
146 while ((int32_t) blkCnt > 0);
147 }
148 else
149 {
150 blkCnt = numSamples * CMPLX_DIM;
151 while (blkCnt > 0) {
152 mve_pred16_t p = vctp16q(blkCnt);
153
154 vecSrcA = vldrhq_z_s16(pSrcA, p);
155 vecSrcB = vldrhq_z_s16(pSrcB, p);
156
157 vecDst = vqdmlsdhq_m(vuninitializedq_s16(), vecSrcA, vecSrcB, p);
158 vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
159
160 vecDst = vshrq_m(vuninitializedq_s16(), vecDst, 2, p);
161 vstrhq_p_s16(pDst, vecDst, p);
162
163 pDst += 8;
164 pSrcA += 8;
165 pSrcB += 8;
166
167 blkCnt -= 8;
168 }
169 }
170 }
171 #else
arm_cmplx_mult_cmplx_q15(const q15_t * pSrcA,const q15_t * pSrcB,q15_t * pDst,uint32_t numSamples)172 ARM_DSP_ATTRIBUTE void arm_cmplx_mult_cmplx_q15(
173 const q15_t * pSrcA,
174 const q15_t * pSrcB,
175 q15_t * pDst,
176 uint32_t numSamples)
177 {
178 uint32_t blkCnt; /* Loop counter */
179 q15_t a, b, c, d; /* Temporary variables */
180
181 #if defined (ARM_MATH_LOOPUNROLL)
182
183 /* Loop unrolling: Compute 4 outputs at a time */
184 blkCnt = numSamples >> 2U;
185
186 while (blkCnt > 0U)
187 {
188 /* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
189 /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
190
191 a = *pSrcA++;
192 b = *pSrcA++;
193 c = *pSrcB++;
194 d = *pSrcB++;
195 /* store result in 3.13 format in destination buffer. */
196 *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
197 *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
198
199 a = *pSrcA++;
200 b = *pSrcA++;
201 c = *pSrcB++;
202 d = *pSrcB++;
203 *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
204 *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
205
206 a = *pSrcA++;
207 b = *pSrcA++;
208 c = *pSrcB++;
209 d = *pSrcB++;
210 *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
211 *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
212
213 a = *pSrcA++;
214 b = *pSrcA++;
215 c = *pSrcB++;
216 d = *pSrcB++;
217 *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
218 *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
219
220 /* Decrement loop counter */
221 blkCnt--;
222 }
223
224 /* Loop unrolling: Compute remaining outputs */
225 blkCnt = numSamples % 0x4U;
226
227 #else
228
229 /* Initialize blkCnt with number of samples */
230 blkCnt = numSamples;
231
232 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
233
234 while (blkCnt > 0U)
235 {
236 /* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
237 /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
238
239 a = *pSrcA++;
240 b = *pSrcA++;
241 c = *pSrcB++;
242 d = *pSrcB++;
243
244 /* store result in 3.13 format in destination buffer. */
245 *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
246 *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
247
248 /* Decrement loop counter */
249 blkCnt--;
250 }
251
252 }
253 #endif /* defined(ARM_MATH_MVEI) */
254
/**
  @} end of CmplxByCmplxMult group
 */
258