1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cmplx_mult_cmplx_q15.c
4 * Description: Q15 complex-by-complex multiplication
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/complex_math_functions.h"
30
31 /**
32 @ingroup groupCmplxMath
33 */
34
35 /**
36 @addtogroup CmplxByCmplxMult
37 @{
38 */
39
/**
  @brief         Q15 complex-by-complex multiplication.
  @param[in]     pSrcA       points to first input vector
  @param[in]     pSrcB       points to second input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of complex samples in each vector
  @return        none

  @par           Scaling and Overflow Behavior
                   The function implements 1.15 by 1.15 multiplications, and the final output is converted into 3.13 format.
 */
51
52 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
53
/*
 * Helium (MVE) integer variant.
 *
 * Data is handled in q15x8_t vectors of 8 q15 values (interleaved real and
 * imaginary parts). Inputs of at least 16 complex samples go through a
 * manually software-pipelined loop that consumes two vectors per operand per
 * iteration; shorter inputs, and the remainder of longer ones, are processed
 * with predicated loads and stores.
 *
 * pSrcA, pSrcB : input vectors, interleaved real/imaginary q15 values
 * pDst         : output vector, same layout; every result is shifted right
 *                by 2 before being stored
 * numSamples   : number of complex samples in each vector
 */
void arm_cmplx_mult_cmplx_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        q15_t * pDst,
        uint32_t numSamples)
{
    int32_t         blkCnt;
    q15x8_t         vecSrcA, vecSrcB;
    q15x8_t         vecSrcC, vecSrcD;
    q15x8_t         vecDst;

    /*
     * Number of 8-sample groups (two vectors per operand), minus one:
     * the last group is computed after the loop so that the loop body can
     * always preload the data of the following step.
     */
    blkCnt = (numSamples >> 3);
    blkCnt -= 1;
    if (blkCnt > 0)
    {
        /* should give more freedom to generate stall free code */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 8;
        pSrcB += 8;

        while (blkCnt > 0)
        {
            /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
            vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcA, vecSrcB);
            /* preload the second vector of this group while A/B are being used */
            vecSrcC = vld1q(pSrcA);
            pSrcA += 8;

            /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
            vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 8;

            vstrhq_s16(pDst, vshrq(vecDst, 2));
            pDst += 8;

            /* same computation on C/D, while preloading A/B for the next step */
            vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 8;

            vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 8;

            vstrhq_s16(pDst, vshrq(vecDst, 2));
            pDst += 8;

            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
        vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcA, vecSrcB);
        /* note: the source pointers are NOT advanced after these two loads */
        vecSrcC = vld1q(pSrcA);

        vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);

        vstrhq_s16(pDst, vshrq(vecDst, 2));
        pDst += 8;

        vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcC, vecSrcD);
        vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);

        vstrhq_s16(pDst, vshrq(vecDst, 2));
        pDst += 8;

        /*
         * tail: remaining (numSamples & 7) complex samples, counted in q15
         * values. The body runs at least once, even when the count is zero;
         * NOTE(review): this relies on vctp16q(0) disabling every lane so
         * that the pass has no effect - confirm against the intrinsic spec.
         */
        blkCnt = CMPLX_DIM * (numSamples & 7);
        do
        {
            mve_pred16_t    p = vctp16q(blkCnt);

            /*
             * Advance before loading: the pointers still reference the start
             * of the vector consumed by the previous step.
             */
            pSrcA += 8;
            pSrcB += 8;

            vecSrcA = vldrhq_z_s16(pSrcA, p);
            vecSrcB = vldrhq_z_s16(pSrcB, p);

            vecDst = vqdmlsdhq_m(vuninitializedq_s16(), vecSrcA, vecSrcB, p);
            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);

            vecDst = vshrq_m(vuninitializedq_s16(), vecDst, 2, p);
            vstrhq_p_s16(pDst, vecDst, p);
            pDst += 8;

            blkCnt -= 8;
        }
        while ((int32_t) blkCnt > 0);
    }
    else
    {
        /* fewer than 16 complex samples: predicated loop only, 8 q15 values per pass */
        blkCnt = numSamples * CMPLX_DIM;
        while (blkCnt > 0) {
            mve_pred16_t    p = vctp16q(blkCnt);

            vecSrcA = vldrhq_z_s16(pSrcA, p);
            vecSrcB = vldrhq_z_s16(pSrcB, p);

            vecDst = vqdmlsdhq_m(vuninitializedq_s16(), vecSrcA, vecSrcB, p);
            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);

            vecDst = vshrq_m(vuninitializedq_s16(), vecDst, 2, p);
            vstrhq_p_s16(pDst, vecDst, p);

            pDst += 8;
            pSrcA += 8;
            pSrcB += 8;

            blkCnt -= 8;
        }
    }
}
172 #else
/*
 * Portable scalar variant.
 *
 * For every complex sample (a + jb) from pSrcA and (c + jd) from pSrcB,
 * writes the real part (a*c - b*d) and the imaginary part (a*d + b*c) to
 * pDst. Each product is computed in 32 bits and shifted right by 17 before
 * the add/subtract, so the stored q15 result is in 3.13 format.
 *
 * The shifts operate on signed products that may be negative; the result of
 * right-shifting a negative value is implementation-defined in C.
 *
 * pSrcA, pSrcB : input vectors, interleaved real/imaginary q15 values
 * pDst         : output vector, same layout (2 * numSamples values)
 * numSamples   : number of complex samples in each vector
 */
void arm_cmplx_mult_cmplx_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        q15_t * pDst,
        uint32_t numSamples)
{
        uint32_t blkCnt;                               /* Loop counter */
        q15_t a, b, c, d;                              /* Real/imaginary parts of the current A and B samples */

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

  while (blkCnt > 0U)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    /* store result in 3.13 format in destination buffer. */
    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = numSamples;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  /* One complex sample per pass: the whole input, or the unrolled loop's remainder */
  while (blkCnt > 0U)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;

    /* store result in 3.13 format in destination buffer. */
    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );

    /* Decrement loop counter */
    blkCnt--;
  }

}
254 #endif /* defined(ARM_MATH_MVEI) */
255
256 /**
257 @} end of CmplxByCmplxMult group
258 */
259