1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cmplx_mult_cmplx_f16.c
4 * Description: Floating-point complex-by-complex multiplication
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/complex_math_functions_f16.h"
30
31 #if defined(ARM_FLOAT16_SUPPORTED)
32
33 /**
34 @ingroup groupCmplxMath
35 */
36
37
38
39 /**
40 @addtogroup CmplxByCmplxMult
41 @{
42 */
43
44 /**
45 @brief Floating-point complex-by-complex multiplication.
46 @param[in] pSrcA points to first input vector
47 @param[in] pSrcB points to second input vector
48 @param[out] pDst points to output vector
49 @param[in] numSamples number of samples in each vector
50 @return none
51 */
52
53 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
54
/*
 * MVE half-precision implementation (selected when ARM_MATH_MVE_FLOAT16 is
 * defined and ARM_MATH_AUTOVECTORIZE is not).
 *
 * pSrcA, pSrcB and pDst are walked 8 half-precision values at a time, i.e.
 * one f16x8_t vector per step. With the interleaved (real, imag) layout used
 * by the scalar variant of this function, that is 4 complex samples per
 * vector.
 *
 * Two code paths:
 *  - (numSamples >> 3) >= 2, i.e. numSamples >= 16: a software-pipelined loop
 *    that handles two vectors (8 complex samples) per iteration, a peeled
 *    final iteration, and a predicated tail for the numSamples & 7 leftover
 *    samples.
 *  - otherwise: a fully predicated loop over numSamples * CMPLX_DIM values.
 *
 * The statement order inside the pipelined section is deliberate (loads for
 * the next vector are issued between the two multiply steps of the current
 * one); do not reorder it without re-checking the generated code.
 *
 * NOTE(review): CMPLX_DIM is defined outside this file; the pointer
 * arithmetic below is only consistent if it is 2 - confirm.
 */
void arm_cmplx_mult_cmplx_f16(
  const float16_t * pSrcA,
  const float16_t * pSrcB,
        float16_t * pDst,
        uint32_t numSamples)
{
    int32_t         blkCnt;             /* Signed: it is decremented past zero below */
    f16x8_t         vecSrcA, vecSrcB;   /* First vector of each pair of inputs */
    f16x8_t         vecSrcC, vecSrcD;   /* Second vector of each pair of inputs */
    f16x8_t         vec_acc;            /* Product being assembled for the current vector */

    /*
     * Number of 8-sample groups, minus one: the last group is processed
     * after the loop (see the epilogue below).
     */
    blkCnt = (numSamples >> 3);
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* should give more freedom to generate stall free code */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 8;
        pSrcB += 8;

        /*
         * Invariant at the top of each iteration: vecSrcA/vecSrcB hold the
         * next unprocessed vectors and pSrcA/pSrcB point just past them.
         */
        while (blkCnt > 0) {
            /*
             * vcmulq followed by vcmlaq_rot90 on the same operands yields
             * the full complex product (see the Arm MVE intrinsics
             * reference). The loads of the following vector are placed
             * between the two steps.
             */
            vec_acc = vcmulq(vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 8;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 8;
            vst1q(pDst, vec_acc);
            pDst += 8;

            /* Second vector of the pair; reload A/B for the next iteration */
            vec_acc = vcmulq(vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 8;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 8;
            vst1q(pDst, vec_acc);
            pDst += 8;
            /*
             * Decrement the block loop counter
             */
            blkCnt--;
        }

        /*
         * Final pair of vectors, kept outside the loop so that armclang
         * does not break the software pipeline.
         * Unlike the loop body, pSrcA/pSrcB are NOT advanced after loading
         * vecSrcC/vecSrcD here: on exit they still point at the start of
         * the last vector consumed.
         */
        vec_acc = vcmulq(vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);
        vst1q(pDst, vec_acc);
        pDst += 8;

        vec_acc = vcmulq(vecSrcC, vecSrcD);
        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
        vst1q(pDst, vec_acc);
        pDst += 8;

        /*
         * tail: numSamples & 7 leftover samples, counted in scalar values
         */
        blkCnt = CMPLX_DIM * (numSamples & 7);
        while (blkCnt > 0) {
            /* Predicate limiting the operation to the remaining values */
            mve_pred16_t    p = vctp16q(blkCnt);
            /*
             * Advance BEFORE loading: the source pointers were left on the
             * last vector already consumed (by the epilogue above, or by
             * the previous tail iteration).
             */
            pSrcA += 8;
            pSrcB += 8;

            vecSrcA = vldrhq_z_f16(pSrcA, p);
            vecSrcB = vldrhq_z_f16(pSrcB, p);
            vec_acc = vcmulq_m(vuninitializedq_f16(),vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);

            vstrhq_p_f16(pDst, vec_acc, p);
            pDst += 8;

            blkCnt -= 8;
        }
    } else {
        /* small vector: fewer than 16 samples, everything is predicated */
        blkCnt = numSamples * CMPLX_DIM;

        /*
         * do/while: the body runs once even when numSamples is 0. In that
         * case the predicate is built from a count of 0.
         * NOTE(review): presumably all lanes are then inactive and no
         * memory is accessed - confirm against the vctp16q specification.
         */
        do {
            mve_pred16_t    p = vctp16q(blkCnt);

            vecSrcA = vldrhq_z_f16(pSrcA, p);
            vecSrcB = vldrhq_z_f16(pSrcB, p);

            vec_acc = vcmulq_m(vuninitializedq_f16(),vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
            vstrhq_p_f16(pDst, vec_acc, p);
            pDst += 8;

            /*
             * Decrement the blkCnt loop counter
             * Advance vector source and destination pointers
             */
            pSrcA += 8;
            pSrcB += 8;
            blkCnt -= 8;
        }
        while (blkCnt > 0);
    }

}
161
162
163 #else
/*
 * Scalar half-precision implementation (used when the MVE variant is not
 * selected).
 *
 * Inputs and output are interleaved (real, imag) pairs; numSamples counts
 * pairs. For each sample i:
 *   C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]
 *   C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]
 *
 * With ARM_MATH_LOOPUNROLL defined (and ARM_MATH_AUTOVECTORIZE not defined)
 * four samples are computed per iteration and the remainder is handled by
 * the generic loop at the end; otherwise the generic loop handles all
 * samples.
 */
void arm_cmplx_mult_cmplx_f16(
  const float16_t * pSrcA,
  const float16_t * pSrcB,
        float16_t * pDst,
        uint32_t numSamples)
{
    uint32_t blkCnt;                               /* Loop counter */
    _Float16 a, b, c, d;  /* a + jb is the current sample of A, c + jd that of B */

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

    /* Loop unrolling: Compute 4 outputs at a time */
    blkCnt = numSamples >> 2U;

    while (blkCnt > 0U)
    {
        /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
        /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

        /* Sample 1 of 4 */
        a = *pSrcA++;
        b = *pSrcA++;
        c = *pSrcB++;
        d = *pSrcB++;
        /* store result in destination buffer. */
        *pDst++ = (a * c) - (b * d);
        *pDst++ = (a * d) + (b * c);

        /* Sample 2 of 4 */
        a = *pSrcA++;
        b = *pSrcA++;
        c = *pSrcB++;
        d = *pSrcB++;
        *pDst++ = (a * c) - (b * d);
        *pDst++ = (a * d) + (b * c);

        /* Sample 3 of 4 */
        a = *pSrcA++;
        b = *pSrcA++;
        c = *pSrcB++;
        d = *pSrcB++;
        *pDst++ = (a * c) - (b * d);
        *pDst++ = (a * d) + (b * c);

        /* Sample 4 of 4 */
        a = *pSrcA++;
        b = *pSrcA++;
        c = *pSrcB++;
        d = *pSrcB++;
        *pDst++ = (a * c) - (b * d);
        *pDst++ = (a * d) + (b * c);

        /* Decrement loop counter */
        blkCnt--;
    }

    /* Loop unrolling: Compute remaining outputs (0 to 3 samples) */
    blkCnt = numSamples % 0x4U;

#else

    /* Initialize blkCnt with number of samples */
    blkCnt = numSamples;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */

    while (blkCnt > 0U)
    {
        /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
        /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

        a = *pSrcA++;
        b = *pSrcA++;
        c = *pSrcB++;
        d = *pSrcB++;

        /* store result in destination buffer. */
        *pDst++ = (a * c) - (b * d);
        *pDst++ = (a * d) + (b * c);

        /* Decrement loop counter */
        blkCnt--;
    }

}
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
246
247 /**
248 @} end of CmplxByCmplxMult group
249 */
250
251 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
252