1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_dot_prod_f16.c
4  * Description:  Floating-point complex dot product
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 
33 
34 /**
35   @ingroup groupCmplxMath
36  */
37 
38 
39 /**
40   @addtogroup cmplx_dot_prod
41   @{
42  */
43 
44 /**
45   @brief         Floating-point complex dot product.
46   @param[in]     pSrcA       points to the first input vector
47   @param[in]     pSrcB       points to the second input vector
48   @param[in]     numSamples  number of samples in each vector
49   @param[out]    realResult  real part of the result returned here
50   @param[out]    imagResult  imaginary part of the result returned here
51  */
52 
53 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
54 
55 #include "arm_helium_utils.h"
56 
void arm_cmplx_dot_prod_f16(
    const float16_t * pSrcA,
    const float16_t * pSrcB,
    uint32_t numSamples,
    float16_t * realResult,
    float16_t * imagResult)
{
    /*
     * Helium (MVE) implementation.
     *
     * Every vector load consumes 8 half-precision values from its input.
     * Each pair of loaded vectors is fed to vcmlaq followed by vcmlaq_rot90,
     * both accumulating into vec_acc; the lanes of vec_acc are reduced to one
     * real and one imaginary value at the very end.
     *
     * NOTE(review): inputs are presumably interleaved (re, im) pairs, as the
     * use of CMPLX_DIM suggests - confirm against the header documentation.
     */
    f16x8_t         vec_acc = vdupq_n_f16(0.0f16);
    float16_t       real_sum, imag_sum;
    f16x8_t         vecA0, vecB0;       /* first vector pair of a block  */
    f16x8_t         vecA1, vecB1;       /* second vector pair of a block */
    int32_t         loopCnt;

    /* Number of 8-sample blocks, minus the one handled by the epilogue */
    loopCnt = (int32_t) (numSamples >> 3) - 1;

    if (loopCnt <= 0) {
        /*
         * Short input (fewer than 16 samples): a single predicated loop
         * over the numSamples * CMPLX_DIM scalar elements.
         */
        loopCnt = numSamples * CMPLX_DIM;

        do {
            /* Predicate enabling only the lanes that are still in range */
            mve_pred16_t    p = vctp16q(loopCnt);

            vecA0 = vldrhq_z_f16(pSrcA, p);
            vecB0 = vldrhq_z_f16(pSrcB, p);

            vec_acc = vcmlaq_m(vec_acc, vecA0, vecB0, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecA0, vecB0, p);

            /* Step both inputs by one vector and account for 8 elements */
            pSrcA += 8;
            pSrcB += 8;
            loopCnt -= 8;
        } while (loopCnt > 0);
    } else {
        /*
         * Prologue: pre-load the first vector pair so that loads and
         * multiply-accumulates can be interleaved in the loop below
         * (intended to help the compiler emit stall-free code).
         */
        vecA0 = vld1q(pSrcA);
        vecB0 = vld1q(pSrcB);
        pSrcA += 8;
        pSrcB += 8;

        for (; loopCnt > 0; loopCnt--) {
            vec_acc = vcmlaq(vec_acc, vecA0, vecB0);
            vecA1 = vld1q(pSrcA);
            pSrcA += 8;

            vec_acc = vcmlaq_rot90(vec_acc, vecA0, vecB0);
            vecB1 = vld1q(pSrcB);
            pSrcB += 8;

            vec_acc = vcmlaq(vec_acc, vecA1, vecB1);
            vecA0 = vld1q(pSrcA);
            pSrcA += 8;

            vec_acc = vcmlaq_rot90(vec_acc, vecA1, vecB1);
            vecB0 = vld1q(pSrcB);
            pSrcB += 8;
        }

        /*
         * Epilogue: the last block is handled outside the loop so that
         * armclang does not break the software pipeline.
         * These two loads do not advance the source pointers.
         */
        vec_acc = vcmlaq(vec_acc, vecA0, vecB0);
        vecA1 = vld1q(pSrcA);

        vec_acc = vcmlaq_rot90(vec_acc, vecA0, vecB0);
        vecB1 = vld1q(pSrcB);

        vec_acc = vcmlaq(vec_acc, vecA1, vecB1);
        vec_acc = vcmlaq_rot90(vec_acc, vecA1, vecB1);

        /*
         * Tail: the remaining (numSamples & 7) samples, processed with
         * predicated loads. The pointers are stepped before each load
         * because the epilogue left them on the vector it consumed last.
         */
        for (loopCnt = CMPLX_DIM * (numSamples & 7); loopCnt > 0; loopCnt -= 8) {
            mve_pred16_t    p = vctp16q(loopCnt);

            pSrcA += 8;
            pSrcB += 8;

            vecA0 = vldrhq_z_f16(pSrcA, p);
            vecB0 = vldrhq_z_f16(pSrcB, p);
            vec_acc = vcmlaq_m(vec_acc, vecA0, vecB0, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecA0, vecB0, p);
        }
    }

    /* Reduce the accumulator lanes to a single real / imaginary pair */
    mve_cmplx_sum_intra_r_i_f16(vec_acc, real_sum, imag_sum);

    /* Hand the two sums back through the output pointers */
    *realResult = real_sum;
    *imagResult = imag_sum;
}
161 
162 #else
void arm_cmplx_dot_prod_f16(
  const float16_t * pSrcA,
  const float16_t * pSrcB,
        uint32_t numSamples,
        float16_t * realResult,
        float16_t * imagResult)
{
  /*
   * Scalar implementation.
   *
   * Each sample is read as two consecutive values from each input:
   * (aRe, aIm) from pSrcA and (bRe, bIm) from pSrcB. For every sample
   *   accRe += aRe * bRe - aIm * bIm
   *   accIm += aRe * bIm + aIm * bRe
   * with each product added or subtracted as a separate step.
   */
  uint32_t remaining;                      /* samples still to be processed */
  _Float16 accRe = 0.0f;                   /* running real sum */
  _Float16 accIm = 0.0f;                   /* running imaginary sum */
  _Float16 aRe, aIm, bRe, bIm;             /* current operands */

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

  /* Unrolled section: four samples (eight values per input) per pass */
  for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
  {
    /* sample 0 */
    aRe = pSrcA[0];
    aIm = pSrcA[1];
    bRe = pSrcB[0];
    bIm = pSrcB[1];

    accRe += aRe * bRe;
    accRe -= aIm * bIm;
    accIm += aRe * bIm;
    accIm += aIm * bRe;

    /* sample 1 */
    aRe = pSrcA[2];
    aIm = pSrcA[3];
    bRe = pSrcB[2];
    bIm = pSrcB[3];

    accRe += aRe * bRe;
    accRe -= aIm * bIm;
    accIm += aRe * bIm;
    accIm += aIm * bRe;

    /* sample 2 */
    aRe = pSrcA[4];
    aIm = pSrcA[5];
    bRe = pSrcB[4];
    bIm = pSrcB[5];

    accRe += aRe * bRe;
    accRe -= aIm * bIm;
    accIm += aRe * bIm;
    accIm += aIm * bRe;

    /* sample 3 */
    aRe = pSrcA[6];
    aIm = pSrcA[7];
    bRe = pSrcB[6];
    bIm = pSrcB[7];

    accRe += aRe * bRe;
    accRe -= aIm * bIm;
    accIm += aRe * bIm;
    accIm += aIm * bRe;

    /* Move both inputs past the four samples just consumed */
    pSrcA += 8;
    pSrcB += 8;
  }

  /* Samples left over after the unrolled section (0 to 3) */
  remaining = numSamples & 0x3U;

#else

  /* No unrolling: every sample goes through the loop below */
  remaining = numSamples;

#endif /* defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */

  /* One sample per pass */
  for (; remaining > 0U; remaining--)
  {
    aRe = pSrcA[0];
    aIm = pSrcA[1];
    bRe = pSrcB[0];
    bIm = pSrcB[1];

    accRe += aRe * bRe;
    accRe -= aIm * bIm;
    accIm += aRe * bIm;
    accIm += aIm * bRe;

    pSrcA += 2;
    pSrcB += 2;
  }

  /* Write the accumulated sums to the caller-provided locations */
  *realResult = accRe;
  *imagResult = accIm;
}
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
256 
257 /**
258   @} end of cmplx_dot_prod group
259  */
260 
261 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
262