1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_dot_prod_q31.c
4  * Description:  Q31 complex dot product
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions.h"
30 
31 /**
32   @ingroup groupCmplxMath
33  */
34 
35 /**
36   @addtogroup cmplx_dot_prod
37   @{
38  */
39 
40 /**
41   @brief         Q31 complex dot product.
42   @param[in]     pSrcA       points to the first input vector
43   @param[in]     pSrcB       points to the second input vector
44   @param[in]     numSamples  number of samples in each vector
45   @param[out]    realResult  real part of the result returned here
46   @param[out]    imagResult  imaginary part of the result returned here
47 
48   @par           Scaling and Overflow Behavior
49                    The function is implemented using an internal 64-bit accumulator.
50                    The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
51                    The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
52                    Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768.
53                    The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format.
54                    Input down scaling is not required.
55  */
56 
57 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
58 
arm_cmplx_dot_prod_q31(const q31_t * pSrcA,const q31_t * pSrcB,uint32_t numSamples,q63_t * realResult,q63_t * imagResult)59 ARM_DSP_ATTRIBUTE void arm_cmplx_dot_prod_q31(
60   const q31_t * pSrcA,
61   const q31_t * pSrcB,
62         uint32_t numSamples,
63         q63_t * realResult,
64         q63_t * imagResult)
65 {
66     int32_t         blkCnt;
67     q63_t           accReal = 0LL;
68     q63_t           accImag = 0LL;
69     q31x4_t         vecSrcA, vecSrcB;
70     q31x4_t         vecSrcC, vecSrcD;
71 
72     blkCnt = numSamples >> 2;
73     blkCnt -= 1;
74     if (blkCnt > 0) {
75         /* should give more freedom to generate stall free code */
76         vecSrcA = vld1q(pSrcA);
77         vecSrcB = vld1q(pSrcB);
78         pSrcA += 4;
79         pSrcB += 4;
80 
81         while (blkCnt > 0) {
82 
83             accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
84             vecSrcC = vld1q(pSrcA);
85             pSrcA += 4;
86 
87             accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
88             vecSrcD = vld1q(pSrcB);
89             pSrcB += 4;
90 
91             accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
92             vecSrcA = vld1q(pSrcA);
93             pSrcA += 4;
94 
95             accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
96             vecSrcB = vld1q(pSrcB);
97             pSrcB += 4;
98             /*
99              * Decrement the blockSize loop counter
100              */
101             blkCnt--;
102         }
103 
104         /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
105         accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
106         vecSrcC = vld1q(pSrcA);
107 
108         accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
109         vecSrcD = vld1q(pSrcB);
110 
111         accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
112         vecSrcA = vld1q(pSrcA);
113 
114         accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
115         vecSrcB = vld1q(pSrcB);
116 
117         /*
118          * tail
119          */
120         blkCnt = CMPLX_DIM * (numSamples & 3);
121         do {
122             mve_pred16_t    p = vctp32q(blkCnt);
123 
124             pSrcA += 4;
125             pSrcB += 4;
126 
127             vecSrcA = vldrwq_z_s32(pSrcA, p);
128             vecSrcB = vldrwq_z_s32(pSrcB, p);
129 
130             accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
131             accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);
132 
133             blkCnt -= 4;
134         }
135         while ((int32_t) blkCnt > 0);
136     } else {
137         blkCnt = numSamples * CMPLX_DIM;
138         while (blkCnt > 0) {
139             mve_pred16_t    p = vctp32q(blkCnt);
140 
141             vecSrcA = vldrwq_z_s32(pSrcA, p);
142             vecSrcB = vldrwq_z_s32(pSrcB, p);
143 
144             accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
145             accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);
146 
147             /*
148              * Decrement the blkCnt loop counter
149              * Advance vector source and destination pointers
150              */
151             pSrcA += 4;
152             pSrcB += 4;
153             blkCnt -= 4;
154         }
155     }
156     *realResult = asrl(accReal, (14 - 8));
157     *imagResult = asrl(accImag, (14 - 8));
158 
159 }
160 
161 #else
arm_cmplx_dot_prod_q31(const q31_t * pSrcA,const q31_t * pSrcB,uint32_t numSamples,q63_t * realResult,q63_t * imagResult)162 ARM_DSP_ATTRIBUTE void arm_cmplx_dot_prod_q31(
163   const q31_t * pSrcA,
164   const q31_t * pSrcB,
165         uint32_t numSamples,
166         q63_t * realResult,
167         q63_t * imagResult)
168 {
169         uint32_t blkCnt;                               /* Loop counter */
170         q63_t real_sum = 0, imag_sum = 0;              /* Temporary result variables */
171         q31_t a0,b0,c0,d0;
172 
173 #if defined (ARM_MATH_LOOPUNROLL)
174 
175   /* Loop unrolling: Compute 4 outputs at a time */
176   blkCnt = numSamples >> 2U;
177 
178   while (blkCnt > 0U)
179   {
180     a0 = *pSrcA++;
181     b0 = *pSrcA++;
182     c0 = *pSrcB++;
183     d0 = *pSrcB++;
184 
185     real_sum += ((q63_t)a0 * c0) >> 14;
186     imag_sum += ((q63_t)a0 * d0) >> 14;
187     real_sum -= ((q63_t)b0 * d0) >> 14;
188     imag_sum += ((q63_t)b0 * c0) >> 14;
189 
190     a0 = *pSrcA++;
191     b0 = *pSrcA++;
192     c0 = *pSrcB++;
193     d0 = *pSrcB++;
194 
195     real_sum += ((q63_t)a0 * c0) >> 14;
196     imag_sum += ((q63_t)a0 * d0) >> 14;
197     real_sum -= ((q63_t)b0 * d0) >> 14;
198     imag_sum += ((q63_t)b0 * c0) >> 14;
199 
200     a0 = *pSrcA++;
201     b0 = *pSrcA++;
202     c0 = *pSrcB++;
203     d0 = *pSrcB++;
204 
205     real_sum += ((q63_t)a0 * c0) >> 14;
206     imag_sum += ((q63_t)a0 * d0) >> 14;
207     real_sum -= ((q63_t)b0 * d0) >> 14;
208     imag_sum += ((q63_t)b0 * c0) >> 14;
209 
210     a0 = *pSrcA++;
211     b0 = *pSrcA++;
212     c0 = *pSrcB++;
213     d0 = *pSrcB++;
214 
215     real_sum += ((q63_t)a0 * c0) >> 14;
216     imag_sum += ((q63_t)a0 * d0) >> 14;
217     real_sum -= ((q63_t)b0 * d0) >> 14;
218     imag_sum += ((q63_t)b0 * c0) >> 14;
219 
220     /* Decrement loop counter */
221     blkCnt--;
222   }
223 
224   /* Loop unrolling: Compute remaining outputs */
225   blkCnt = numSamples % 0x4U;
226 
227 #else
228 
229   /* Initialize blkCnt with number of samples */
230   blkCnt = numSamples;
231 
232 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
233 
234   while (blkCnt > 0U)
235   {
236     a0 = *pSrcA++;
237     b0 = *pSrcA++;
238     c0 = *pSrcB++;
239     d0 = *pSrcB++;
240 
241     real_sum += ((q63_t)a0 * c0) >> 14;
242     imag_sum += ((q63_t)a0 * d0) >> 14;
243     real_sum -= ((q63_t)b0 * d0) >> 14;
244     imag_sum += ((q63_t)b0 * c0) >> 14;
245 
246     /* Decrement loop counter */
247     blkCnt--;
248   }
249 
250   /* Store real and imaginary result in 16.48 format  */
251   *realResult = real_sum;
252   *imagResult = imag_sum;
253 }
254 #endif /* defined(ARM_MATH_MVEI) */
255 
256 /**
257   @} end of cmplx_dot_prod group
258  */
259