1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_mult_cmplx_q15.c
4  * Description:  Q15 complex-by-complex multiplication
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions.h"
30 
31 /**
32   @ingroup groupCmplxMath
33  */
34 
35 /**
36   @addtogroup CmplxByCmplxMult
37   @{
38  */
39 
40 /**
41   @brief         Q15 complex-by-complex multiplication.
42   @param[in]     pSrcA       points to first input vector
43   @param[in]     pSrcB       points to second input vector
44   @param[out]    pDst        points to output vector
45   @param[in]     numSamples  number of samples in each vector
46 
47   @par           Scaling and Overflow Behavior
                   The function performs 1.15 by 1.15 multiplications; the results are then converted to 3.13 format.
49  */
50 
51 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
52 
arm_cmplx_mult_cmplx_q15(const q15_t * pSrcA,const q15_t * pSrcB,q15_t * pDst,uint32_t numSamples)53 ARM_DSP_ATTRIBUTE void arm_cmplx_mult_cmplx_q15(
54   const q15_t * pSrcA,
55   const q15_t * pSrcB,
56         q15_t * pDst,
57         uint32_t numSamples)
58 {
59    int32_t         blkCnt;
60     q15x8_t         vecSrcA, vecSrcB;
61     q15x8_t         vecSrcC, vecSrcD;
62     q15x8_t         vecDst;
63 
64     blkCnt = (numSamples >> 3);
65     blkCnt -= 1;
66     if (blkCnt > 0)
67     {
68         /* should give more freedom to generate stall free code */
69         vecSrcA = vld1q(pSrcA);
70         vecSrcB = vld1q(pSrcB);
71         pSrcA += 8;
72         pSrcB += 8;
73 
74         while (blkCnt > 0)
75         {
76 
77             /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
78             vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcA, vecSrcB);
79             vecSrcC = vld1q(pSrcA);
80             pSrcA += 8;
81 
82             /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
83             vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
84             vecSrcD = vld1q(pSrcB);
85             pSrcB += 8;
86 
87             vstrhq_s16(pDst, vshrq(vecDst, 2));
88             pDst += 8;
89 
90             vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcC, vecSrcD);
91             vecSrcA = vld1q(pSrcA);
92             pSrcA += 8;
93 
94             vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
95             vecSrcB = vld1q(pSrcB);
96             pSrcB += 8;
97 
98             vstrhq_s16(pDst, vshrq(vecDst, 2));
99             pDst += 8;
100 
101             /*
102              * Decrement the blockSize loop counter
103              */
104             blkCnt--;
105         }
106 
107         /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
108         vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcA, vecSrcB);
109         vecSrcC = vld1q(pSrcA);
110 
111         vecDst = vqdmladhxq(vecDst, vecSrcA, vecSrcB);
112         vecSrcD = vld1q(pSrcB);
113 
114         vstrhq_s16(pDst, vshrq(vecDst, 2));
115         pDst += 8;
116 
117         vecDst = vqdmlsdhq(vuninitializedq_s16(), vecSrcC, vecSrcD);
118         vecDst = vqdmladhxq(vecDst, vecSrcC, vecSrcD);
119 
120         vstrhq_s16(pDst, vshrq(vecDst, 2));
121         pDst += 8;
122 
123         /*
124          * tail
125          */
126         blkCnt = CMPLX_DIM * (numSamples & 7);
127         do
128         {
129             mve_pred16_t    p = vctp16q(blkCnt);
130 
131             pSrcA += 8;
132             pSrcB += 8;
133 
134             vecSrcA = vldrhq_z_s16(pSrcA, p);
135             vecSrcB = vldrhq_z_s16(pSrcB, p);
136 
137             vecDst = vqdmlsdhq_m(vuninitializedq_s16(), vecSrcA, vecSrcB, p);
138             vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
139 
140             vecDst = vshrq_m(vuninitializedq_s16(), vecDst, 2, p);
141             vstrhq_p_s16(pDst, vecDst, p);
142             pDst += 8;
143 
144             blkCnt -= 8;
145         }
146         while ((int32_t) blkCnt > 0);
147     }
148     else
149     {
150         blkCnt = numSamples * CMPLX_DIM;
151         while (blkCnt > 0) {
152             mve_pred16_t    p = vctp16q(blkCnt);
153 
154             vecSrcA = vldrhq_z_s16(pSrcA, p);
155             vecSrcB = vldrhq_z_s16(pSrcB, p);
156 
157             vecDst = vqdmlsdhq_m(vuninitializedq_s16(), vecSrcA, vecSrcB, p);
158             vecDst = vqdmladhxq_m(vecDst, vecSrcA, vecSrcB, p);
159 
160             vecDst = vshrq_m(vuninitializedq_s16(), vecDst, 2, p);
161             vstrhq_p_s16(pDst, vecDst, p);
162 
163             pDst += 8;
164             pSrcA += 8;
165             pSrcB += 8;
166 
167             blkCnt -= 8;
168     }
169   }
170 }
171 #else
arm_cmplx_mult_cmplx_q15(const q15_t * pSrcA,const q15_t * pSrcB,q15_t * pDst,uint32_t numSamples)172 ARM_DSP_ATTRIBUTE void arm_cmplx_mult_cmplx_q15(
173   const q15_t * pSrcA,
174   const q15_t * pSrcB,
175         q15_t * pDst,
176         uint32_t numSamples)
177 {
178         uint32_t blkCnt;                               /* Loop counter */
179         q15_t a, b, c, d;                              /* Temporary variables */
180 
181 #if defined (ARM_MATH_LOOPUNROLL)
182 
183   /* Loop unrolling: Compute 4 outputs at a time */
184   blkCnt = numSamples >> 2U;
185 
186   while (blkCnt > 0U)
187   {
188     /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
189     /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
190 
191     a = *pSrcA++;
192     b = *pSrcA++;
193     c = *pSrcB++;
194     d = *pSrcB++;
195     /* store result in 3.13 format in destination buffer. */
196     *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
197     *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
198 
199     a = *pSrcA++;
200     b = *pSrcA++;
201     c = *pSrcB++;
202     d = *pSrcB++;
203     *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
204     *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
205 
206     a = *pSrcA++;
207     b = *pSrcA++;
208     c = *pSrcB++;
209     d = *pSrcB++;
210     *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
211     *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
212 
213     a = *pSrcA++;
214     b = *pSrcA++;
215     c = *pSrcB++;
216     d = *pSrcB++;
217     *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
218     *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
219 
220     /* Decrement loop counter */
221     blkCnt--;
222   }
223 
224   /* Loop unrolling: Compute remaining outputs */
225   blkCnt = numSamples % 0x4U;
226 
227 #else
228 
229   /* Initialize blkCnt with number of samples */
230   blkCnt = numSamples;
231 
232 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
233 
234   while (blkCnt > 0U)
235   {
236     /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
237     /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
238 
239     a = *pSrcA++;
240     b = *pSrcA++;
241     c = *pSrcB++;
242     d = *pSrcB++;
243 
244     /* store result in 3.13 format in destination buffer. */
245     *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
246     *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
247 
248     /* Decrement loop counter */
249     blkCnt--;
250   }
251 
252 }
253 #endif /* defined(ARM_MATH_MVEI) */
254 
255 /**
256   @} end of CmplxByCmplxMult group
257  */
258