1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_mult_cmplx_f16.c
4  * Description:  Floating-point complex-by-complex multiplication
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 
33 /**
34   @ingroup groupCmplxMath
35  */
36 
37 
38 
39 /**
40   @addtogroup CmplxByCmplxMult
41   @{
42  */
43 
/**
  @brief         Floating-point complex-by-complex multiplication.
  @param[in]     pSrcA       points to first input vector
  @param[in]     pSrcB       points to second input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of complex samples in each vector
                             (each vector holds 2 * numSamples interleaved real/imaginary values)
  @return        none
 */
52 
53 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
54 
/*
 * Helium (MVE) half-precision implementation.
 *
 * One f16x8_t vector carries 8 half-precision values, i.e. 4 interleaved
 * (real, imaginary) pairs.  The main path is software pipelined: the loads
 * for the next vector are issued between the two arithmetic steps of the
 * current one, and each pipelined step handles two vectors (8 complex
 * samples).  Inputs with fewer than 16 complex samples take the fully
 * predicated "small vector" path instead.
 *
 * The complex product of each pair is built in two steps: vcmulq followed by
 * vcmlaq_rot90 accumulating into the same register (see the ACLE MVE
 * intrinsics reference for the per-lane definition of both operations).
 */
void arm_cmplx_mult_cmplx_f16(
  const float16_t * pSrcA,
  const float16_t * pSrcB,
        float16_t * pDst,
        uint32_t numSamples)
{
    int32_t         blkCnt;
    f16x8_t         vecSrcA, vecSrcB;
    f16x8_t         vecSrcC, vecSrcD;
    f16x8_t         vec_acc;

    /*
     * Number of 8-complex-sample groups, minus one: the last group is
     * processed by the epilogue that follows the pipelined loop.
     */
    blkCnt = (numSamples >> 3);
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* should give more freedom to generate stall free code */
        /* Prologue: preload the first vector of each source */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 8;
        pSrcB += 8;

        while (blkCnt > 0) {
            /* First vector of the group; loads of the second one are interleaved */
            vec_acc = vcmulq(vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 8;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 8;
            vst1q(pDst, vec_acc);
            pDst += 8;

            /* Second vector of the group; loads for the next group are interleaved */
            vec_acc = vcmulq(vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 8;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 8;
            vst1q(pDst, vec_acc);
            pDst += 8;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
        /*
         * Epilogue: vecSrcA/vecSrcB were already loaded by the last loop
         * iteration (or the prologue).  The loads below do NOT advance the
         * source pointers, so on exit pSrcA/pSrcB still address the vector
         * held in vecSrcC/vecSrcD.
         */
        vec_acc = vcmulq(vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);
        vst1q(pDst, vec_acc);
        pDst += 8;

        vec_acc = vcmulq(vecSrcC, vecSrcD);
        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
        vst1q(pDst, vec_acc);
        pDst += 8;

        /*
         * tail: remaining (numSamples & 7) complex samples, counted in
         * half-precision elements (CMPLX_DIM values per sample).
         */
        blkCnt = CMPLX_DIM * (numSamples & 7);
        while (blkCnt > 0) {
            /* Predicate enabling only the lanes that are still to be processed */
            mve_pred16_t    p = vctp16q(blkCnt);
            /*
             * The source pointers lag one vector behind (see epilogue), so
             * they are advanced before the load rather than after it.
             */
            pSrcA += 8;
            pSrcB += 8;

            vecSrcA = vldrhq_z_f16(pSrcA, p);
            vecSrcB = vldrhq_z_f16(pSrcB, p);
            vec_acc = vcmulq_m(vuninitializedq_f16(),vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);

            vstrhq_p_f16(pDst, vec_acc, p);
            pDst += 8;

            blkCnt -= 8;
        }
    } else {
        /* small vector */
        /* Fewer than 16 complex samples: every access is predicated on the element count */
        blkCnt = numSamples * CMPLX_DIM;

        /* The body runs at least once; the predicate is derived from blkCnt each time */
        do {
            mve_pred16_t    p = vctp16q(blkCnt);

            vecSrcA = vldrhq_z_f16(pSrcA, p);
            vecSrcB = vldrhq_z_f16(pSrcB, p);

            vec_acc = vcmulq_m(vuninitializedq_f16(),vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
            vstrhq_p_f16(pDst, vec_acc, p);
            pDst += 8;

            /*
             * Decrement the blkCnt loop counter
             * Advance vector source and destination pointers
             */
            pSrcA += 8;
            pSrcB += 8;
            blkCnt -= 8;
        }
        while (blkCnt > 0);
    }

}
161 
162 
163 #else
/*
 * Scalar half-precision implementation.
 *
 * Both inputs and the output are read/written as consecutive pairs of
 * values; for every pair (a, b) from pSrcA and (c, d) from pSrcB the pair
 * (a*c - b*d, a*d + b*c) is written to pDst.  With ARM_MATH_LOOPUNROLL
 * (and without ARM_MATH_AUTOVECTORIZE) four pairs are processed per
 * iteration and the remaining numSamples % 4 pairs are handled by the
 * generic loop at the end.
 */
void arm_cmplx_mult_cmplx_f16(
  const float16_t * pSrcA,
  const float16_t * pSrcB,
        float16_t * pDst,
        uint32_t numSamples)
{
  uint32_t blkCnt;      /* Loop counter */
  _Float16 a, b, c, d;  /* Temporary variables to store real and imaginary values */

#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

  /* Loop unrolling: Compute 4 outputs at a time */
  blkCnt = numSamples >> 2U;

  while (blkCnt > 0U)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    /* 1st complex sample */
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    /* store result in destination buffer. */
    *pDst++ = (a * c) - (b * d);
    *pDst++ = (a * d) + (b * c);

    /* 2nd complex sample */
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (a * c) - (b * d);
    *pDst++ = (a * d) + (b * c);

    /* 3rd complex sample */
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (a * c) - (b * d);
    *pDst++ = (a * d) + (b * c);

    /* 4th complex sample */
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;
    *pDst++ = (a * c) - (b * d);
    *pDst++ = (a * d) + (b * c);

    /* Decrement loop counter */
    blkCnt--;
  }

  /* Loop unrolling: Compute remaining outputs */
  blkCnt = numSamples % 0x4U;

#else

  /* Initialize blkCnt with number of samples */
  blkCnt = numSamples;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */

  /* One complex sample per iteration (all samples, or the unrolled loop's remainder) */
  while (blkCnt > 0U)
  {
    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;

    /* store result in destination buffer. */
    *pDst++ = (a * c) - (b * d);
    *pDst++ = (a * d) + (b * c);

    /* Decrement loop counter */
    blkCnt--;
  }

}
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
246 
247 /**
248   @} end of CmplxByCmplxMult group
249  */
250 
251 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
252