1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cmplx_mult_cmplx_f16.c
4 * Description: Floating-point complex-by-complex multiplication
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/complex_math_functions_f16.h"
30
31 #if defined(ARM_FLOAT16_SUPPORTED)
32
33 /**
34 @ingroup groupCmplxMath
35 */
36
37
38
39 /**
40 @addtogroup CmplxByCmplxMult
41 @{
42 */
43
44 /**
45 @brief Floating-point complex-by-complex multiplication.
46 @param[in] pSrcA points to first input vector
47 @param[in] pSrcB points to second input vector
48 @param[out] pDst points to output vector
49 @param[in] numSamples number of samples in each vector
50 */
51
52 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
53
arm_cmplx_mult_cmplx_f16(const float16_t * pSrcA,const float16_t * pSrcB,float16_t * pDst,uint32_t numSamples)54 ARM_DSP_ATTRIBUTE void arm_cmplx_mult_cmplx_f16(
55 const float16_t * pSrcA,
56 const float16_t * pSrcB,
57 float16_t * pDst,
58 uint32_t numSamples)
59 {
60 int32_t blkCnt;
61 f16x8_t vecSrcA, vecSrcB;
62 f16x8_t vecSrcC, vecSrcD;
63 f16x8_t vec_acc;
64
65 blkCnt = (numSamples >> 3);
66 blkCnt -= 1;
67 if (blkCnt > 0) {
68 /* should give more freedom to generate stall free code */
69 vecSrcA = vld1q(pSrcA);
70 vecSrcB = vld1q(pSrcB);
71 pSrcA += 8;
72 pSrcB += 8;
73
74 while (blkCnt > 0) {
75 vec_acc = vcmulq(vecSrcA, vecSrcB);
76 vecSrcC = vld1q(pSrcA);
77 pSrcA += 8;
78
79 vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
80 vecSrcD = vld1q(pSrcB);
81 pSrcB += 8;
82 vst1q(pDst, vec_acc);
83 pDst += 8;
84
85 vec_acc = vcmulq(vecSrcC, vecSrcD);
86 vecSrcA = vld1q(pSrcA);
87 pSrcA += 8;
88
89 vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
90 vecSrcB = vld1q(pSrcB);
91 pSrcB += 8;
92 vst1q(pDst, vec_acc);
93 pDst += 8;
94 /*
95 * Decrement the blockSize loop counter
96 */
97 blkCnt--;
98 }
99
100 /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
101 vec_acc = vcmulq(vecSrcA, vecSrcB);
102 vecSrcC = vld1q(pSrcA);
103
104 vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
105 vecSrcD = vld1q(pSrcB);
106 vst1q(pDst, vec_acc);
107 pDst += 8;
108
109 vec_acc = vcmulq(vecSrcC, vecSrcD);
110 vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
111 vst1q(pDst, vec_acc);
112 pDst += 8;
113
114 /*
115 * tail
116 */
117 blkCnt = CMPLX_DIM * (numSamples & 7);
118 while (blkCnt > 0) {
119 mve_pred16_t p = vctp16q(blkCnt);
120 pSrcA += 8;
121 pSrcB += 8;
122
123 vecSrcA = vldrhq_z_f16(pSrcA, p);
124 vecSrcB = vldrhq_z_f16(pSrcB, p);
125 vec_acc = vcmulq_m(vuninitializedq_f16(),vecSrcA, vecSrcB, p);
126 vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
127
128 vstrhq_p_f16(pDst, vec_acc, p);
129 pDst += 8;
130
131 blkCnt -= 8;
132 }
133 } else {
134 /* small vector */
135 blkCnt = numSamples * CMPLX_DIM;
136
137 do {
138 mve_pred16_t p = vctp16q(blkCnt);
139
140 vecSrcA = vldrhq_z_f16(pSrcA, p);
141 vecSrcB = vldrhq_z_f16(pSrcB, p);
142
143 vec_acc = vcmulq_m(vuninitializedq_f16(),vecSrcA, vecSrcB, p);
144 vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
145 vstrhq_p_f16(pDst, vec_acc, p);
146 pDst += 8;
147
148 /*
149 * Decrement the blkCnt loop counter
150 * Advance vector source and destination pointers
151 */
152 pSrcA += 8;
153 pSrcB += 8;
154 blkCnt -= 8;
155 }
156 while (blkCnt > 0);
157 }
158
159 }
160
161
162 #else
arm_cmplx_mult_cmplx_f16(const float16_t * pSrcA,const float16_t * pSrcB,float16_t * pDst,uint32_t numSamples)163 ARM_DSP_ATTRIBUTE void arm_cmplx_mult_cmplx_f16(
164 const float16_t * pSrcA,
165 const float16_t * pSrcB,
166 float16_t * pDst,
167 uint32_t numSamples)
168 {
169 uint32_t blkCnt; /* Loop counter */
170 _Float16 a, b, c, d; /* Temporary variables to store real and imaginary values */
171
172 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
173
174 /* Loop unrolling: Compute 4 outputs at a time */
175 blkCnt = numSamples >> 2U;
176
177 while (blkCnt > 0U)
178 {
179 /* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
180 /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
181
182 a = *pSrcA++;
183 b = *pSrcA++;
184 c = *pSrcB++;
185 d = *pSrcB++;
186 /* store result in destination buffer. */
187 *pDst++ = (a * c) - (b * d);
188 *pDst++ = (a * d) + (b * c);
189
190 a = *pSrcA++;
191 b = *pSrcA++;
192 c = *pSrcB++;
193 d = *pSrcB++;
194 *pDst++ = (a * c) - (b * d);
195 *pDst++ = (a * d) + (b * c);
196
197 a = *pSrcA++;
198 b = *pSrcA++;
199 c = *pSrcB++;
200 d = *pSrcB++;
201 *pDst++ = (a * c) - (b * d);
202 *pDst++ = (a * d) + (b * c);
203
204 a = *pSrcA++;
205 b = *pSrcA++;
206 c = *pSrcB++;
207 d = *pSrcB++;
208 *pDst++ = (a * c) - (b * d);
209 *pDst++ = (a * d) + (b * c);
210
211 /* Decrement loop counter */
212 blkCnt--;
213 }
214
215 /* Loop unrolling: Compute remaining outputs */
216 blkCnt = numSamples % 0x4U;
217
218 #else
219
220 /* Initialize blkCnt with number of samples */
221 blkCnt = numSamples;
222
223 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
224
225 while (blkCnt > 0U)
226 {
227 /* C[2 * i ] = A[2 * i] * B[2 * i ] - A[2 * i + 1] * B[2 * i + 1]. */
228 /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i ]. */
229
230 a = *pSrcA++;
231 b = *pSrcA++;
232 c = *pSrcB++;
233 d = *pSrcB++;
234
235 /* store result in destination buffer. */
236 *pDst++ = (a * c) - (b * d);
237 *pDst++ = (a * d) + (b * c);
238
239 /* Decrement loop counter */
240 blkCnt--;
241 }
242
243 }
244 #endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
245
246 /**
247 @} end of CmplxByCmplxMult group
248 */
249
250 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
251