1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_mult_cmplx_q15.c
4  * Description:  Q15 complex-by-complex multiplication
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions.h"
30 
31 /**
32   @ingroup groupCmplxMath
33  */
34 
35 /**
36   @addtogroup CmplxByCmplxMult
37   @{
38  */
39 
/**
  @brief         Q15 complex-by-complex multiplication.
  @param[in]     pSrcA       points to first input vector
  @param[in]     pSrcB       points to second input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of complex samples in each vector
  @return        none

  @par           Scaling and Overflow Behavior
                   The function implements 1.15 by 1.15 multiplications, and the final output is converted into 3.13 format.
 */
51 
52 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
53 
/*
 * MVE integer variant of the Q15 complex-by-complex multiplication.
 *
 * Inputs and output are interleaved (real, imaginary) q15 values. Every vector
 * load/store moves 8 q15 values, and the pointers advance by 8 per vector.
 *
 * Structure:
 *   - numSamples >= 16: a software-pipelined main loop handling two vectors
 *     per input per iteration, an epilogue for the last two full vectors,
 *     then a predicated tail for the remaining (numSamples & 7) samples.
 *   - numSamples < 16: a single predicated loop over all elements.
 *
 * Each product vector is shifted right by 2 before being stored, which gives
 * the 3.13 output format.
 *
 * The statement order in the pipelined section is deliberate (loads are
 * interleaved with the multiplies of the previous vectors); do not reorder.
 */
void arm_cmplx_mult_cmplx_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        q15_t * pDst,
        uint32_t numSamples)
{
    int32_t         blkCnt;             /* Loop counter (signed: may reach <= 0) */
    q15x8_t         vecSrcA, vecSrcB;   /* First pipelined pair of input vectors */
    q15x8_t         vecSrcC, vecSrcD;   /* Second pipelined pair of input vectors */
    q15x8_t         vecDst;             /* Result vector */

    /*
     * One main-loop iteration consumes 8 complex samples (two vectors per input).
     * The last group of 8 is peeled off into the epilogue below, hence the -1.
     */
    blkCnt = (numSamples >> 3);
    blkCnt -= 1;
    if (blkCnt > 0)
    {
        /* should give more freedom to generate stall free code */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 8;
        pSrcB += 8;

        while (blkCnt > 0)
        {

            /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
            vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcA, vecSrcB);
            /* Load the next A vector while the current pair is being processed */
            vecSrcC = vld1q(pSrcA);
            pSrcA += 8;

            /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
            vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
            /* Load the next B vector */
            vecSrcD = vld1q(pSrcB);
            pSrcB += 8;

            /* Scale down by 2 bits and store */
            vstrhq_s16(pDst, vshrq(vecDst, 2));
            pDst += 8;

            /* Same sequence for the (C, D) pair, reloading (A, B) for the next pass */
            vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 8;

            vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 8;

            vstrhq_s16(pDst, vshrq(vecDst, 2));
            pDst += 8;

            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
        vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcA, vecSrcB);
        /* Note: the source pointers are NOT advanced after these two loads;
         * the tail loop below advances them before its own loads. */
        vecSrcC = vld1q(pSrcA);

        vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);

        vstrhq_s16(pDst, vshrq(vecDst, 2));
        pDst += 8;

        vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcC, vecSrcD);
        vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);

        vstrhq_s16(pDst, vshrq(vecDst, 2));
        pDst += 8;

        /*
         * tail
         *
         * blkCnt is now a count of q15 elements (CMPLX_DIM per complex sample;
         * presumably 2 - confirm against the header that defines it).
         * The body always runs once, even when blkCnt is 0; in that case the
         * predicate is built from a zero count.
         * NOTE(review): expected to disable every lane so nothing is loaded or
         * stored - confirm against the vctp16q documentation.
         */
        blkCnt = CMPLX_DIM * (numSamples & 7);
        do
        {
            mve_pred16_t    p = vctp16q(blkCnt);

            /* Step past the vectors loaded by the previous stage */
            pSrcA += 8;
            pSrcB += 8;

            vecSrcA = vldrhq_z_s16(pSrcA, p);
            vecSrcB = vldrhq_z_s16(pSrcB, p);

            vecDst = vqdmlsdhq_m(vuninitializedq_s16(), vecSrcA, vecSrcB, p);
            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);

            vecDst = vshrq_m(vuninitializedq_s16(), vecDst, 2, p);
            vstrhq_p_s16(pDst, vecDst, p);
            pDst += 8;

            blkCnt -= 8;
        }
        while ((int32_t) blkCnt > 0);
    }
    else
    {
        /* Fewer than 16 complex samples: predicated loop over all q15 elements */
        blkCnt = numSamples * CMPLX_DIM;
        while (blkCnt > 0) {
            mve_pred16_t    p = vctp16q(blkCnt);

            vecSrcA = vldrhq_z_s16(pSrcA, p);
            vecSrcB = vldrhq_z_s16(pSrcB, p);

            vecDst = vqdmlsdhq_m(vuninitializedq_s16(), vecSrcA, vecSrcB, p);
            vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);

            vecDst = vshrq_m(vuninitializedq_s16(), vecDst, 2, p);
            vstrhq_p_s16(pDst, vecDst, p);

            pDst += 8;
            pSrcA += 8;
            pSrcB += 8;

            blkCnt -= 8;
        }
    }
}
172 #else
/*
 * Scalar variant of the Q15 complex-by-complex multiplication.
 *
 * pSrcA, pSrcB and pDst hold interleaved (real, imaginary) q15 values;
 * numSamples is the number of complex samples, so 2 * numSamples q15 values
 * are read from each input and written to the output.
 *
 * For each sample (a + jb) * (c + jd):
 *   real = (a*c >> 17) - (b*d >> 17)
 *   imag = (a*d >> 17) + (b*c >> 17)
 * Each 32-bit product is shifted individually before the add/subtract, and the
 * result is stored in 3.13 format.
 *
 * Note: the products are signed, so the code relies on >> of a negative value
 * behaving as an arithmetic shift (implementation-defined in C).
 */
void arm_cmplx_mult_cmplx_q15(
  const q15_t * pSrcA,
  const q15_t * pSrcB,
        q15_t * pDst,
        uint32_t numSamples)
{
  uint32_t blkCnt;                               /* Loop counter */
  q15_t a, b, c, d;                              /* Real/imag parts of A (a, b) and B (c, d) */

#if defined (ARM_MATH_LOOPUNROLL)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

  while (blkCnt > 0U)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    /* Sample 1 of 4 */
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    /* store result in 3.13 format in destination buffer. */
    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );

    /* Sample 2 of 4 */
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );

    /* Sample 3 of 4 */
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );

    /* Sample 4 of 4 */
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = numSamples;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

  /* Remaining samples (or all of them when unrolling is disabled) */
  while (blkCnt > 0U)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;

    /* store result in 3.13 format in destination buffer. */
    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );

    /* Decrement loop counter */
    blkCnt--;
  }

}
254 #endif /* defined(ARM_MATH_MVEI) */
255 
256 /**
257   @} end of CmplxByCmplxMult group
258  */
259