1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cmplx_mult_cmplx_f16.c
4  * Description:  Floating-point complex-by-complex multiplication
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/complex_math_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 
33 /**
34   @ingroup groupCmplxMath
35  */
36 
37 
38 
39 /**
40   @addtogroup CmplxByCmplxMult
41   @{
42  */
43 
44 /**
45   @brief         Floating-point complex-by-complex multiplication.
46   @param[in]     pSrcA       points to first input vector
47   @param[in]     pSrcB       points to second input vector
48   @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of complex samples in each vector
50  */
51 
52 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
53 
arm_cmplx_mult_cmplx_f16(const float16_t * pSrcA,const float16_t * pSrcB,float16_t * pDst,uint32_t numSamples)54 ARM_DSP_ATTRIBUTE void arm_cmplx_mult_cmplx_f16(
55   const float16_t * pSrcA,
56   const float16_t * pSrcB,
57         float16_t * pDst,
58         uint32_t numSamples)
59 {
60      int32_t         blkCnt;
61     f16x8_t         vecSrcA, vecSrcB;
62     f16x8_t         vecSrcC, vecSrcD;
63     f16x8_t         vec_acc;
64 
65     blkCnt = (numSamples >> 3);
66     blkCnt -= 1;
67     if (blkCnt > 0) {
68         /* should give more freedom to generate stall free code */
69         vecSrcA = vld1q(pSrcA);
70         vecSrcB = vld1q(pSrcB);
71         pSrcA += 8;
72         pSrcB += 8;
73 
74         while (blkCnt > 0) {
75             vec_acc = vcmulq(vecSrcA, vecSrcB);
76             vecSrcC = vld1q(pSrcA);
77             pSrcA += 8;
78 
79             vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
80             vecSrcD = vld1q(pSrcB);
81             pSrcB += 8;
82             vst1q(pDst, vec_acc);
83             pDst += 8;
84 
85             vec_acc = vcmulq(vecSrcC, vecSrcD);
86             vecSrcA = vld1q(pSrcA);
87             pSrcA += 8;
88 
89             vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
90             vecSrcB = vld1q(pSrcB);
91             pSrcB += 8;
92             vst1q(pDst, vec_acc);
93             pDst += 8;
94             /*
95              * Decrement the blockSize loop counter
96              */
97             blkCnt--;
98         }
99 
100         /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
101         vec_acc = vcmulq(vecSrcA, vecSrcB);
102         vecSrcC = vld1q(pSrcA);
103 
104         vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
105         vecSrcD = vld1q(pSrcB);
106         vst1q(pDst, vec_acc);
107         pDst += 8;
108 
109         vec_acc = vcmulq(vecSrcC, vecSrcD);
110         vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
111         vst1q(pDst, vec_acc);
112         pDst += 8;
113 
114         /*
115          * tail
116          */
117         blkCnt = CMPLX_DIM * (numSamples & 7);
118         while (blkCnt > 0) {
119             mve_pred16_t    p = vctp16q(blkCnt);
120             pSrcA += 8;
121             pSrcB += 8;
122 
123             vecSrcA = vldrhq_z_f16(pSrcA, p);
124             vecSrcB = vldrhq_z_f16(pSrcB, p);
125             vec_acc = vcmulq_m(vuninitializedq_f16(),vecSrcA, vecSrcB, p);
126             vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
127 
128             vstrhq_p_f16(pDst, vec_acc, p);
129             pDst += 8;
130 
131             blkCnt -= 8;
132         }
133     } else {
134         /* small vector */
135         blkCnt = numSamples * CMPLX_DIM;
136 
137         do {
138             mve_pred16_t    p = vctp16q(blkCnt);
139 
140             vecSrcA = vldrhq_z_f16(pSrcA, p);
141             vecSrcB = vldrhq_z_f16(pSrcB, p);
142 
143             vec_acc = vcmulq_m(vuninitializedq_f16(),vecSrcA, vecSrcB, p);
144             vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
145             vstrhq_p_f16(pDst, vec_acc, p);
146             pDst += 8;
147 
148             /*
149              * Decrement the blkCnt loop counter
150              * Advance vector source and destination pointers
151              */
152             pSrcA += 8;
153             pSrcB += 8;
154             blkCnt -= 8;
155         }
156         while (blkCnt > 0);
157     }
158 
159 }
160 
161 
162 #else
arm_cmplx_mult_cmplx_f16(const float16_t * pSrcA,const float16_t * pSrcB,float16_t * pDst,uint32_t numSamples)163 ARM_DSP_ATTRIBUTE void arm_cmplx_mult_cmplx_f16(
164   const float16_t * pSrcA,
165   const float16_t * pSrcB,
166         float16_t * pDst,
167         uint32_t numSamples)
168 {
169     uint32_t blkCnt;                               /* Loop counter */
170     _Float16 a, b, c, d;  /* Temporary variables to store real and imaginary values */
171 
172 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
173 
174   /* Loop unrolling: Compute 4 outputs at a time */
175   blkCnt = numSamples >> 2U;
176 
177   while (blkCnt > 0U)
178   {
179     /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
180     /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
181 
182     a = *pSrcA++;
183     b = *pSrcA++;
184     c = *pSrcB++;
185     d = *pSrcB++;
186     /* store result in destination buffer. */
187     *pDst++ = (a * c) - (b * d);
188     *pDst++ = (a * d) + (b * c);
189 
190     a = *pSrcA++;
191     b = *pSrcA++;
192     c = *pSrcB++;
193     d = *pSrcB++;
194     *pDst++ = (a * c) - (b * d);
195     *pDst++ = (a * d) + (b * c);
196 
197     a = *pSrcA++;
198     b = *pSrcA++;
199     c = *pSrcB++;
200     d = *pSrcB++;
201     *pDst++ = (a * c) - (b * d);
202     *pDst++ = (a * d) + (b * c);
203 
204     a = *pSrcA++;
205     b = *pSrcA++;
206     c = *pSrcB++;
207     d = *pSrcB++;
208     *pDst++ = (a * c) - (b * d);
209     *pDst++ = (a * d) + (b * c);
210 
211     /* Decrement loop counter */
212     blkCnt--;
213   }
214 
215   /* Loop unrolling: Compute remaining outputs */
216   blkCnt = numSamples % 0x4U;
217 
218 #else
219 
220   /* Initialize blkCnt with number of samples */
221   blkCnt = numSamples;
222 
223 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
224 
225   while (blkCnt > 0U)
226   {
227     /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
228     /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
229 
230     a = *pSrcA++;
231     b = *pSrcA++;
232     c = *pSrcB++;
233     d = *pSrcB++;
234 
235     /* store result in destination buffer. */
236     *pDst++ = (a * c) - (b * d);
237     *pDst++ = (a * d) + (b * c);
238 
239     /* Decrement loop counter */
240     blkCnt--;
241   }
242 
243 }
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
245 
246 /**
247   @} end of CmplxByCmplxMult group
248  */
249 
250 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
251