1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_dot_prod_q31.c
4  * Description:  Q31 complex dot product
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions.h"
30 
31 /**
32   @ingroup groupCmplxMath
33  */
34 
35 /**
36   @addtogroup cmplx_dot_prod
37   @{
38  */
39 
40 /**
41   @brief         Q31 complex dot product.
42   @param[in]     pSrcA       points to the first input vector
43   @param[in]     pSrcB       points to the second input vector
44   @param[in]     numSamples  number of samples in each vector
45   @param[out]    realResult  real part of the result returned here
46   @param[out]    imagResult  imaginary part of the result returned here
47   @return        none
48 
49   @par           Scaling and Overflow Behavior
50                    The function is implemented using an internal 64-bit accumulator.
51                    The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
52                    The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
53                    Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768.
54                    The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format.
55                    Input down scaling is not required.
56  */
57 
58 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
59 
arm_cmplx_dot_prod_q31(const q31_t * pSrcA,const q31_t * pSrcB,uint32_t numSamples,q63_t * realResult,q63_t * imagResult)60 void arm_cmplx_dot_prod_q31(
61   const q31_t * pSrcA,
62   const q31_t * pSrcB,
63         uint32_t numSamples,
64         q63_t * realResult,
65         q63_t * imagResult)
66 {
67     int32_t         blkCnt;
68     q63_t           accReal = 0LL;
69     q63_t           accImag = 0LL;
70     q31x4_t         vecSrcA, vecSrcB;
71     q31x4_t         vecSrcC, vecSrcD;
72 
73     blkCnt = numSamples >> 2;
74     blkCnt -= 1;
75     if (blkCnt > 0) {
76         /* should give more freedom to generate stall free code */
77         vecSrcA = vld1q(pSrcA);
78         vecSrcB = vld1q(pSrcB);
79         pSrcA += 4;
80         pSrcB += 4;
81 
82         while (blkCnt > 0) {
83 
84             accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
85             vecSrcC = vld1q(pSrcA);
86             pSrcA += 4;
87 
88             accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
89             vecSrcD = vld1q(pSrcB);
90             pSrcB += 4;
91 
92             accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
93             vecSrcA = vld1q(pSrcA);
94             pSrcA += 4;
95 
96             accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
97             vecSrcB = vld1q(pSrcB);
98             pSrcB += 4;
99             /*
100              * Decrement the blockSize loop counter
101              */
102             blkCnt--;
103         }
104 
105         /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
106         accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
107         vecSrcC = vld1q(pSrcA);
108 
109         accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
110         vecSrcD = vld1q(pSrcB);
111 
112         accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
113         vecSrcA = vld1q(pSrcA);
114 
115         accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
116         vecSrcB = vld1q(pSrcB);
117 
118         /*
119          * tail
120          */
121         blkCnt = CMPLX_DIM * (numSamples & 3);
122         do {
123             mve_pred16_t    p = vctp32q(blkCnt);
124 
125             pSrcA += 4;
126             pSrcB += 4;
127 
128             vecSrcA = vldrwq_z_s32(pSrcA, p);
129             vecSrcB = vldrwq_z_s32(pSrcB, p);
130 
131             accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
132             accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);
133 
134             blkCnt -= 4;
135         }
136         while ((int32_t) blkCnt > 0);
137     } else {
138         blkCnt = numSamples * CMPLX_DIM;
139         while (blkCnt > 0) {
140             mve_pred16_t    p = vctp32q(blkCnt);
141 
142             vecSrcA = vldrwq_z_s32(pSrcA, p);
143             vecSrcB = vldrwq_z_s32(pSrcB, p);
144 
145             accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
146             accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);
147 
148             /*
149              * Decrement the blkCnt loop counter
150              * Advance vector source and destination pointers
151              */
152             pSrcA += 4;
153             pSrcB += 4;
154             blkCnt -= 4;
155         }
156     }
157     *realResult = asrl(accReal, (14 - 8));
158     *imagResult = asrl(accImag, (14 - 8));
159 
160 }
161 
162 #else
arm_cmplx_dot_prod_q31(const q31_t * pSrcA,const q31_t * pSrcB,uint32_t numSamples,q63_t * realResult,q63_t * imagResult)163 void arm_cmplx_dot_prod_q31(
164   const q31_t * pSrcA,
165   const q31_t * pSrcB,
166         uint32_t numSamples,
167         q63_t * realResult,
168         q63_t * imagResult)
169 {
170         uint32_t blkCnt;                               /* Loop counter */
171         q63_t real_sum = 0, imag_sum = 0;              /* Temporary result variables */
172         q31_t a0,b0,c0,d0;
173 
174 #if defined (ARM_MATH_LOOPUNROLL)
175 
176   /* Loop unrolling: Compute 4 outputs at a time */
177   blkCnt = numSamples >> 2U;
178 
179   while (blkCnt > 0U)
180   {
181     a0 = *pSrcA++;
182     b0 = *pSrcA++;
183     c0 = *pSrcB++;
184     d0 = *pSrcB++;
185 
186     real_sum += ((q63_t)a0 * c0) >> 14;
187     imag_sum += ((q63_t)a0 * d0) >> 14;
188     real_sum -= ((q63_t)b0 * d0) >> 14;
189     imag_sum += ((q63_t)b0 * c0) >> 14;
190 
191     a0 = *pSrcA++;
192     b0 = *pSrcA++;
193     c0 = *pSrcB++;
194     d0 = *pSrcB++;
195 
196     real_sum += ((q63_t)a0 * c0) >> 14;
197     imag_sum += ((q63_t)a0 * d0) >> 14;
198     real_sum -= ((q63_t)b0 * d0) >> 14;
199     imag_sum += ((q63_t)b0 * c0) >> 14;
200 
201     a0 = *pSrcA++;
202     b0 = *pSrcA++;
203     c0 = *pSrcB++;
204     d0 = *pSrcB++;
205 
206     real_sum += ((q63_t)a0 * c0) >> 14;
207     imag_sum += ((q63_t)a0 * d0) >> 14;
208     real_sum -= ((q63_t)b0 * d0) >> 14;
209     imag_sum += ((q63_t)b0 * c0) >> 14;
210 
211     a0 = *pSrcA++;
212     b0 = *pSrcA++;
213     c0 = *pSrcB++;
214     d0 = *pSrcB++;
215 
216     real_sum += ((q63_t)a0 * c0) >> 14;
217     imag_sum += ((q63_t)a0 * d0) >> 14;
218     real_sum -= ((q63_t)b0 * d0) >> 14;
219     imag_sum += ((q63_t)b0 * c0) >> 14;
220 
221     /* Decrement loop counter */
222     blkCnt--;
223   }
224 
225   /* Loop unrolling: Compute remaining outputs */
226   blkCnt = numSamples % 0x4U;
227 
228 #else
229 
230   /* Initialize blkCnt with number of samples */
231   blkCnt = numSamples;
232 
233 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
234 
235   while (blkCnt > 0U)
236   {
237     a0 = *pSrcA++;
238     b0 = *pSrcA++;
239     c0 = *pSrcB++;
240     d0 = *pSrcB++;
241 
242     real_sum += ((q63_t)a0 * c0) >> 14;
243     imag_sum += ((q63_t)a0 * d0) >> 14;
244     real_sum -= ((q63_t)b0 * d0) >> 14;
245     imag_sum += ((q63_t)b0 * c0) >> 14;
246 
247     /* Decrement loop counter */
248     blkCnt--;
249   }
250 
251   /* Store real and imaginary result in 16.48 format  */
252   *realResult = real_sum;
253   *imagResult = imag_sum;
254 }
255 #endif /* defined(ARM_MATH_MVEI) */
256 
257 /**
258   @} end of cmplx_dot_prod group
259  */
260