1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_mult_cmplx_q31.c
4  * Description:  Q31 complex-by-complex multiplication
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions.h"
30 
31 /**
32   @ingroup groupCmplxMath
33  */
34 
35 /**
36   @addtogroup CmplxByCmplxMult
37   @{
38  */
39 
/**
  @brief         Q31 complex-by-complex multiplication.
  @param[in]     pSrcA       points to first input vector
  @param[in]     pSrcB       points to second input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of complex samples in each vector

  @par           Scaling and Overflow Behavior
                   The function performs 1.31 by 1.31 multiplications, and the final output is converted to 3.29 format.
                   Input down-scaling is not required.
 */
51 
52 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
/*
 * MVE (Helium) intrinsics implementation of the Q31 complex-by-complex
 * multiply.
 *
 * pSrcA, pSrcB : interleaved input vectors (real, imaginary, real, ...)
 * pDst         : interleaved output vector
 * numSamples   : number of complex samples in each vector
 *
 * Every vector load/store moves four q31_t values and the pointers advance
 * by 4 per vector.  The main path is software pipelined: operands for the
 * next step are loaded while the current step is being multiplied, so the
 * order of the statements below is deliberate and must not be shuffled.
 */
void arm_cmplx_mult_cmplx_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        q31_t * pDst,
        uint32_t numSamples)
{
    int32_t         blkCnt;
    q31x4_t         vecSrcA, vecSrcB;
    q31x4_t         vecSrcC, vecSrcD;
    q31x4_t         vecDst;

    /*
     * One "block" is two vectors per input.  The last full block is peeled
     * off and handled after the loop, hence the -1.  If that leaves nothing
     * for the loop, the fully predicated path in the else branch is used.
     */
    blkCnt = numSamples >> 2;
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* should give more freedom to generate stall free code */
        /* Prime the pipeline with the first vector of each input. */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 4;
        pSrcB += 4;

        while (blkCnt > 0) {

            /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
            vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcA, vecSrcB);
            /* Fetch the second A vector of this block while the first is in flight. */
            vecSrcC = vld1q(pSrcA);
            pSrcA += 4;

            /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
            vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 4;

            /* Shift right by 2 before storing (conversion to the output format). */
            vst1q(pDst, vshrq(vecDst, 2));
            pDst += 4;

            /* Second vector of the block; A/B are reloaded for the next iteration. */
            vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 4;

            vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 4;

            vst1q(pDst, vshrq(vecDst, 2));
            pDst += 4;

            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
        /*
         * vecSrcA/vecSrcB already hold the first vector of the final block.
         * The second vector is loaded WITHOUT advancing pSrcA/pSrcB; the tail
         * loop below steps past it before its first load.
         */
        vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);

        vst1q(pDst, vshrq(vecDst, 2));
        pDst += 4;

        vecDst = vqdmlsdhq(vuninitializedq_s32(), vecSrcC, vecSrcD);
        vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);

        vst1q(pDst, vshrq(vecDst, 2));
        pDst += 4;

        /*
         * tail
         *
         * Remaining element count is CMPLX_DIM * (numSamples & 3); CMPLX_DIM
         * is defined outside this file (presumably 2, one real and one
         * imaginary value per sample - confirm against the header).
         * The body runs at least once, even when the count is 0; in that case
         * the predicate is built from 0 and presumably leaves every lane
         * inactive, so the load and store have no effect.
         */
        blkCnt = CMPLX_DIM * (numSamples & 3);
        do {
            mve_pred16_t    p = vctp32q(blkCnt);

            /* Advance first: the pointers still address the last vector consumed above. */
            pSrcA += 4;
            pSrcB += 4;

            vecSrcA = vldrwq_z_s32(pSrcA, p);
            vecSrcB = vldrwq_z_s32(pSrcB, p);

            vecDst = vqdmlsdhq_m(vuninitializedq_s32(), vecSrcA, vecSrcB, p);
            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);

            vecDst = vshrq_m(vuninitializedq_s32(), vecDst, 2, p);
            vstrwq_p_s32(pDst, vecDst, p);
            pDst += 4;

            blkCnt -= 4;
        }
        while ((int32_t) blkCnt > 0);
    } else {
        /*
         * Short input (fewer than two full blocks): process everything with
         * predicated loads/stores, four q31_t values per iteration.
         */
        blkCnt = numSamples * CMPLX_DIM;
        while (blkCnt > 0) {
            mve_pred16_t    p = vctp32q(blkCnt);

            vecSrcA = vldrwq_z_s32(pSrcA, p);
            vecSrcB = vldrwq_z_s32(pSrcB, p);

            vecDst = vqdmlsdhq_m(vuninitializedq_s32(), vecSrcA, vecSrcB, p);
            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);

            vecDst = vshrq_m(vuninitializedq_s32(), vecDst, 2, p);
            vstrwq_p_s32(pDst, vecDst, p);

            pDst += 4;
            pSrcA += 4;
            pSrcB += 4;

            blkCnt -= 4;
        }
    }
}
166 #else
/*
 * Scalar implementation of the Q31 complex-by-complex multiply.
 *
 * pSrcA, pSrcB : interleaved input vectors (real, imaginary, real, ...)
 * pDst         : interleaved output vector
 * numSamples   : number of complex samples in each vector
 *
 * For every sample, with A = a + jb and B = c + jd:
 *   real = a*c - b*d
 *   imag = a*d + b*c
 * Each 64-bit product is shifted right by 33 before the add/subtract, so the
 * stored value is in 3.29 format.
 */
void arm_cmplx_mult_cmplx_q31(
  const q31_t * pSrcA,
  const q31_t * pSrcB,
        q31_t * pDst,
        uint32_t numSamples)
{
  uint32_t blkCnt;                               /* Loop counter */
  q31_t a, b, c, d;                              /* Real/imag parts of the current A (a, b) and B (c, d) samples */

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

  while (blkCnt > 0U)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    /* 1st complex sample of the group */
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    /* store result in 3.29 format in destination buffer. */
    *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
    *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );

    /* 2nd complex sample of the group */
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
    *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );

    /* 3rd complex sample of the group */
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
    *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );

    /* 4th complex sample of the group */
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
    *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = numSamples;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  /* One complex sample per iteration: all samples, or the 0..3 left over after unrolling. */
  while (blkCnt > 0U)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;

    /* store result in 3.29 format in destination buffer. */
    *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
    *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );

    /* Decrement loop counter */
    blkCnt--;
  }

}
248 #endif /* defined(ARM_MATH_MVEI) */
249 
250 /**
251   @} end of CmplxByCmplxMult group
252  */
253