1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_dot_prod_q15.c
4  * Description:  Processing function for the Q15 Complex Dot product
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions.h"
30 
31 /**
32   @ingroup groupCmplxMath
33  */
34 
35 /**
36   @addtogroup cmplx_dot_prod
37   @{
38  */
39 
40 /**
41   @brief         Q15 complex dot product.
42   @param[in]     pSrcA       points to the first input vector
43   @param[in]     pSrcB       points to the second input vector
44   @param[in]     numSamples  number of samples in each vector
45   @param[out]    realResult  real part of the result returned here
46   @param[out]    imagResult  imaginary part of the result returned her
47 
48   @par           Scaling and Overflow Behavior
49                    The function is implemented using an internal 64-bit accumulator.
50                    The intermediate 1.15 by 1.15 multiplications are performed with full precision and yield a 2.30 result.
51                    These are accumulated in a 64-bit accumulator with 34.30 precision.
52                    As a final step, the accumulators are converted to 8.24 format.
53                    The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
54  */
55 
56 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_cmplx_dot_prod_q15(const q15_t * pSrcA,const q15_t * pSrcB,uint32_t numSamples,q31_t * realResult,q31_t * imagResult)57 ARM_DSP_ATTRIBUTE void arm_cmplx_dot_prod_q15(
58   const q15_t * pSrcA,
59   const q15_t * pSrcB,
60         uint32_t numSamples,
61         q31_t * realResult,
62         q31_t * imagResult)
63 {
64     int32_t         blkCnt;
65     q63_t           accReal = 0LL;
66     q63_t           accImag = 0LL;
67     q15x8_t         vecSrcA, vecSrcB;
68     q15x8_t         vecSrcC, vecSrcD;
69 
70     blkCnt = (numSamples >> 3);
71     blkCnt -= 1;
72     if (blkCnt > 0) {
73         /* should give more freedom to generate stall free code */
74         vecSrcA = vld1q(pSrcA);
75         vecSrcB = vld1q(pSrcB);
76         pSrcA += 8;
77         pSrcB += 8;
78 
79         while (blkCnt > 0) {
80 
81             accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
82             vecSrcC = vld1q(pSrcA);
83             pSrcA += 8;
84 
85             accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
86             vecSrcD = vld1q(pSrcB);
87             pSrcB += 8;
88 
89             accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD);
90             vecSrcA = vld1q(pSrcA);
91             pSrcA += 8;
92 
93             accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD);
94             vecSrcB = vld1q(pSrcB);
95             pSrcB += 8;
96             /*
97              * Decrement the blockSize loop counter
98              */
99             blkCnt--;
100         }
101 
102         /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
103         accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
104         vecSrcC = vld1q(pSrcA);
105 
106         accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
107         vecSrcD = vld1q(pSrcB);
108 
109         accReal = vmlsldavaq(accReal, vecSrcC, vecSrcD);
110         vecSrcA = vld1q(pSrcA);
111 
112         accImag = vmlaldavaxq(accImag, vecSrcC, vecSrcD);
113         vecSrcB = vld1q(pSrcB);
114 
115         /*
116          * tail
117          */
118         blkCnt = CMPLX_DIM * (numSamples & 7);
119         do {
120             mve_pred16_t    p = vctp16q(blkCnt);
121 
122             pSrcA += 8;
123             pSrcB += 8;
124 
125             vecSrcA = vldrhq_z_s16(pSrcA, p);
126             vecSrcB = vldrhq_z_s16(pSrcB, p);
127 
128             accReal = vmlsldavaq_p(accReal, vecSrcA, vecSrcB, p);
129             accImag = vmlaldavaxq_p(accImag, vecSrcA, vecSrcB, p);
130 
131             blkCnt -= 8;
132         }
133         while ((int32_t) blkCnt > 0);
134     } else {
135         blkCnt = numSamples * CMPLX_DIM;
136         while (blkCnt > 0) {
137             mve_pred16_t    p = vctp16q(blkCnt);
138 
139             vecSrcA = vldrhq_z_s16(pSrcA, p);
140             vecSrcB = vldrhq_z_s16(pSrcB, p);
141 
142             accReal = vmlsldavaq_p(accReal, vecSrcA, vecSrcB, p);
143             accImag = vmlaldavaxq_p(accImag, vecSrcA, vecSrcB, p);
144 
145             /*
146              * Decrement the blkCnt loop counter
147              * Advance vector source and destination pointers
148              */
149             pSrcA += 8;
150             pSrcB += 8;
151             blkCnt -= 8;
152         }
153     }
154     *realResult = asrl(accReal, (14 - 8));
155     *imagResult = asrl(accImag, (14 - 8));
156 }
157 #else
arm_cmplx_dot_prod_q15(const q15_t * pSrcA,const q15_t * pSrcB,uint32_t numSamples,q31_t * realResult,q31_t * imagResult)158 ARM_DSP_ATTRIBUTE void arm_cmplx_dot_prod_q15(
159   const q15_t * pSrcA,
160   const q15_t * pSrcB,
161         uint32_t numSamples,
162         q31_t * realResult,
163         q31_t * imagResult)
164 {
165         uint32_t blkCnt;                               /* Loop counter */
166         q63_t real_sum = 0, imag_sum = 0;              /* Temporary result variables */
167         q15_t a0,b0,c0,d0;
168 
169 #if defined (ARM_MATH_LOOPUNROLL)
170   /* Loop unrolling: Compute 4 outputs at a time */
171   blkCnt = numSamples >> 2U;
172 
173   while (blkCnt > 0U)
174   {
175     a0 = *pSrcA++;
176     b0 = *pSrcA++;
177     c0 = *pSrcB++;
178     d0 = *pSrcB++;
179 
180     real_sum += (q31_t)a0 * c0;
181     imag_sum += (q31_t)a0 * d0;
182     real_sum -= (q31_t)b0 * d0;
183     imag_sum += (q31_t)b0 * c0;
184 
185     a0 = *pSrcA++;
186     b0 = *pSrcA++;
187     c0 = *pSrcB++;
188     d0 = *pSrcB++;
189 
190     real_sum += (q31_t)a0 * c0;
191     imag_sum += (q31_t)a0 * d0;
192     real_sum -= (q31_t)b0 * d0;
193     imag_sum += (q31_t)b0 * c0;
194 
195     a0 = *pSrcA++;
196     b0 = *pSrcA++;
197     c0 = *pSrcB++;
198     d0 = *pSrcB++;
199 
200     real_sum += (q31_t)a0 * c0;
201     imag_sum += (q31_t)a0 * d0;
202     real_sum -= (q31_t)b0 * d0;
203     imag_sum += (q31_t)b0 * c0;
204 
205     a0 = *pSrcA++;
206     b0 = *pSrcA++;
207     c0 = *pSrcB++;
208     d0 = *pSrcB++;
209 
210     real_sum += (q31_t)a0 * c0;
211     imag_sum += (q31_t)a0 * d0;
212     real_sum -= (q31_t)b0 * d0;
213     imag_sum += (q31_t)b0 * c0;
214 
215     /* Decrement loop counter */
216     blkCnt--;
217   }
218 
219   /* Loop unrolling: Compute remaining outputs */
220   blkCnt = numSamples % 0x4U;
221 
222 #else
223 
224   /* Initialize blkCnt with number of samples */
225   blkCnt = numSamples;
226 
227 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
228 
229   while (blkCnt > 0U)
230   {
231     a0 = *pSrcA++;
232     b0 = *pSrcA++;
233     c0 = *pSrcB++;
234     d0 = *pSrcB++;
235 
236     real_sum += (q31_t)a0 * c0;
237     imag_sum += (q31_t)a0 * d0;
238     real_sum -= (q31_t)b0 * d0;
239     imag_sum += (q31_t)b0 * c0;
240 
241     /* Decrement loop counter */
242     blkCnt--;
243   }
244 
245   /* Store real and imaginary result in 8.24 format  */
246   /* Convert real data in 34.30 to 8.24 by 6 right shifts */
247   *realResult = (q31_t) (real_sum >> 6);
248   /* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
249   *imagResult = (q31_t) (imag_sum >> 6);
250 }
251 #endif /* defined(ARM_MATH_MVEI) */
252 
253 /**
254   @} end of cmplx_dot_prod group
255  */
256