1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cmplx_dot_prod_f16.c
4 * Description: Floating-point complex dot product
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/complex_math_functions_f16.h"
30
31 #if defined(ARM_FLOAT16_SUPPORTED)
32
33
34 /**
35 @ingroup groupCmplxMath
36 */
37
38
39 /**
40 @addtogroup cmplx_dot_prod
41 @{
42 */
43
44 /**
45 @brief Floating-point complex dot product.
46 @param[in] pSrcA points to the first input vector
47 @param[in] pSrcB points to the second input vector
48 @param[in] numSamples number of samples in each vector
49 @param[out] realResult real part of the result returned here
50 @param[out] imagResult imaginary part of the result returned here
51 */
52
53 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
54
55 #include "arm_helium_utils.h"
56
/*
 * Helium (MVE) half-precision variant.
 *
 * Inputs are consumed as interleaved (real, imag) half-precision pairs, so
 * numSamples complex samples occupy CMPLX_DIM * numSamples elements and one
 * f16x8_t load covers 8 elements. Each vcmlaq / vcmlaq_rot90 pair on the same
 * operands accumulates one vector's worth of complex products (see the ACLE
 * VCMLA definition); the accumulator lanes are reduced to a single
 * real/imaginary pair at the end.
 */
void arm_cmplx_dot_prod_f16(
    const float16_t * pSrcA,
    const float16_t * pSrcB,
    uint32_t numSamples,
    float16_t * realResult,
    float16_t * imagResult)
{
    int32_t   loopCnt;                          /* signed so that "count - 1" can go negative */
    float16_t sumRe, sumIm;                     /* written by the reduction helper below */
    f16x8_t   accVec = vdupq_n_f16(0.0f16);     /* running per-lane accumulator */
    f16x8_t   vecA0, vecB0;                     /* first operand pair of the pipeline */
    f16x8_t   vecA1, vecB1;                     /* second operand pair of the pipeline */

    /*
     * One loop iteration consumes two vectors (16 elements = 8 complex
     * samples) per input. The final group is handled by the epilogue, hence
     * the "- 1"; the pipelined path is therefore taken only when at least two
     * such groups exist.
     */
    loopCnt = (int32_t) (numSamples >> 3) - 1;

    if (loopCnt > 0) {
        /* Prime the pipeline: loads run ahead of the multiply-accumulates */
        vecA0 = vld1q(pSrcA);
        vecB0 = vld1q(pSrcB);
        pSrcA += 8;
        pSrcB += 8;

        for (; loopCnt > 0; loopCnt--) {
            /* Loads are interleaved with the arithmetic on purpose; keep this order */
            accVec = vcmlaq(accVec, vecA0, vecB0);
            vecA1 = vld1q(pSrcA);
            pSrcA += 8;

            accVec = vcmlaq_rot90(accVec, vecA0, vecB0);
            vecB1 = vld1q(pSrcB);
            pSrcB += 8;

            accVec = vcmlaq(accVec, vecA1, vecB1);
            vecA0 = vld1q(pSrcA);
            pSrcA += 8;

            accVec = vcmlaq_rot90(accVec, vecA1, vecB1);
            vecB0 = vld1q(pSrcB);
            pSrcB += 8;
        }

        /*
         * Epilogue: the last two-vector group is processed outside the loop
         * (keeps armclang from breaking the software pipeline). The pointers
         * are intentionally NOT advanced after these final loads.
         */
        accVec = vcmlaq(accVec, vecA0, vecB0);
        vecA1 = vld1q(pSrcA);

        accVec = vcmlaq_rot90(accVec, vecA0, vecB0);
        vecB1 = vld1q(pSrcB);

        accVec = vcmlaq(accVec, vecA1, vecB1);
        accVec = vcmlaq_rot90(accVec, vecA1, vecB1);

        /*
         * Tail: the remaining (numSamples & 7) complex samples, expressed in
         * half-precision elements and handled with lane predication.
         */
        for (loopCnt = CMPLX_DIM * (numSamples & 7); loopCnt > 0; loopCnt -= 8) {
            mve_pred16_t tailPred = vctp16q(loopCnt);

            /* Pointers still address the previously loaded vector: step past it first */
            pSrcA += 8;
            pSrcB += 8;

            vecA0 = vldrhq_z_f16(pSrcA, tailPred);
            vecB0 = vldrhq_z_f16(pSrcB, tailPred);
            accVec = vcmlaq_m(accVec, vecA0, vecB0, tailPred);
            accVec = vcmlaq_rot90_m(accVec, vecA0, vecB0, tailPred);
        }
    } else {
        /* Short input (fewer than 16 complex samples): fully predicated loop */
        loopCnt = numSamples * CMPLX_DIM;
        accVec = vdupq_n_f16(0.0f16);

        do {
            mve_pred16_t lanePred = vctp16q(loopCnt);

            vecA0 = vldrhq_z_f16(pSrcA, lanePred);
            vecB0 = vldrhq_z_f16(pSrcB, lanePred);

            accVec = vcmlaq_m(accVec, vecA0, vecB0, lanePred);
            accVec = vcmlaq_rot90_m(accVec, vecA0, vecB0, lanePred);

            /* Move on to the next vector of each input and update the element count */
            pSrcA += 8;
            pSrcB += 8;
            loopCnt -= 8;
        } while (loopCnt > 0);
    }

    /*
     * Collapse the per-lane partial sums into scalar real / imaginary totals
     * (helper presumably provided by arm_helium_utils.h; it assigns sumRe and sumIm).
     */
    mve_cmplx_sum_intra_r_i_f16(accVec, sumRe, sumIm);

    /* Hand the two components back to the caller */
    *realResult = sumRe;
    *imagResult = sumIm;
}
161
162 #else
/*
 * Scalar half-precision variant.
 *
 * Both inputs are read as interleaved (real, imag) pairs. For every sample
 * the complex product A[n] * B[n] (no conjugation) is accumulated:
 *     real += aRe * bRe - aIm * bIm
 *     imag += aRe * bIm + aIm * bRe
 * Accumulation is carried out in _Float16.
 */
void arm_cmplx_dot_prod_f16(
    const float16_t * pSrcA,
    const float16_t * pSrcB,
    uint32_t numSamples,
    float16_t * realResult,
    float16_t * imagResult)
{
    _Float16 accRe = 0.0f;          /* running real part of the result */
    _Float16 accIm = 0.0f;          /* running imaginary part of the result */
    _Float16 aRe, aIm, bRe, bIm;    /* current sample of A and of B */
    uint32_t remaining;             /* iterations left in the active loop */

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

    /* Unrolled path: four complex samples (eight elements per input) per pass */
    for (remaining = numSamples >> 2U; remaining > 0U; remaining--)
    {
        /* 1st sample of the group */
        aRe = pSrcA[0];
        aIm = pSrcA[1];
        bRe = pSrcB[0];
        bIm = pSrcB[1];

        accRe += aRe * bRe;
        accIm += aRe * bIm;
        accRe -= aIm * bIm;
        accIm += aIm * bRe;

        /* 2nd sample of the group */
        aRe = pSrcA[2];
        aIm = pSrcA[3];
        bRe = pSrcB[2];
        bIm = pSrcB[3];

        accRe += aRe * bRe;
        accIm += aRe * bIm;
        accRe -= aIm * bIm;
        accIm += aIm * bRe;

        /* 3rd sample of the group */
        aRe = pSrcA[4];
        aIm = pSrcA[5];
        bRe = pSrcB[4];
        bIm = pSrcB[5];

        accRe += aRe * bRe;
        accIm += aRe * bIm;
        accRe -= aIm * bIm;
        accIm += aIm * bRe;

        /* 4th sample of the group */
        aRe = pSrcA[6];
        aIm = pSrcA[7];
        bRe = pSrcB[6];
        bIm = pSrcB[7];

        accRe += aRe * bRe;
        accIm += aRe * bIm;
        accRe -= aIm * bIm;
        accIm += aIm * bRe;

        /* Step both inputs past the four samples just consumed */
        pSrcA += 8;
        pSrcB += 8;
    }

    /* Samples left over after the groups of four */
    remaining = numSamples & 0x3U;

#else

    /* No unrolling: every sample goes through the loop below */
    remaining = numSamples;

#endif /* defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */

    for (; remaining > 0U; remaining--)
    {
        aRe = pSrcA[0];
        aIm = pSrcA[1];
        bRe = pSrcB[0];
        bIm = pSrcB[1];

        accRe += aRe * bRe;
        accIm += aRe * bIm;
        accRe -= aIm * bIm;
        accIm += aIm * bRe;

        /* One complex sample = two elements */
        pSrcA += 2;
        pSrcB += 2;
    }

    /* Write both components of the result to the caller's destinations */
    *realResult = accRe;
    *imagResult = accIm;
}
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
256
257 /**
258 @} end of cmplx_dot_prod group
259 */
260
261 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
262