1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
 * Title:        arm_cfft_f16.c
 * Description:  Combined Radix Decimation in Frequency CFFT half-precision floating point processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/transform_functions_f16.h"
30 #include "arm_common_tables_f16.h"
31 
32 
33 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
34 
35 #include "arm_helium_utils.h"
36 #include "arm_vec_fft.h"
37 #include "arm_mve_tables_f16.h"
38 
39 
arm_inverse_fft_length_f16(uint16_t fftLen)40 static float16_t arm_inverse_fft_length_f16(uint16_t fftLen)
41 {
42   float16_t retValue=1.0;
43 
44   switch (fftLen)
45   {
46 
47   case 4096U:
48     retValue = (float16_t)0.000244140625f;
49     break;
50 
51   case 2048U:
52     retValue = (float16_t)0.00048828125f;
53     break;
54 
55   case 1024U:
56     retValue = (float16_t)0.0009765625f;
57     break;
58 
59   case 512U:
60     retValue = (float16_t)0.001953125f;
61     break;
62 
63   case 256U:
64     retValue = (float16_t)0.00390625f;
65     break;
66 
67   case 128U:
68     retValue = (float16_t)0.0078125f;
69     break;
70 
71   case 64U:
72     retValue = (float16_t)0.015625f;
73     break;
74 
75   case 32U:
76     retValue = (float16_t)0.03125f;
77     break;
78 
79   case 16U:
80     retValue = (float16_t)0.0625f;
81     break;
82 
83 
84   default:
85     break;
86   }
87   return(retValue);
88 }
89 
90 
/**
  @brief         Forward radix-4 decimation-in-frequency CFFT core (Helium/MVE), in place.
  @param[in]     S       points to the CFFT instance; supplies the rearranged twiddle
                         tables and their per-stage start offsets
  @param[in,out] pSrc    interleaved complex float16 data (2*fftLen values), overwritten
  @param[in]     fftLen  number of complex samples processed by this call
                         (NOTE(review): the stage loop assumes a power-of-4 length —
                          visible callers pass 16/64/256/1024/4096 or fftLen/2 of the
                          radix4by2 lengths; confirm for any new caller)

  Output ordering is whatever the DIF flow produces; the public wrapper applies
  bit reversal separately when requested.
 */
static void _arm_radix4_butterfly_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc, uint32_t fftLen)
{
    f16x8_t vecTmp0, vecTmp1;
    f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f16x8_t vecA, vecB, vecC, vecD;
    uint32_t  blkCnt;
    uint32_t  n1, n2;
    uint32_t  stage = 0;
    int32_t  iter = 1;
    /* Byte offsets for the gather/scatter last stage: four radix-4 legs spaced
       4 complex (8 float16 = 16 byte) apart, expressed relative to a base that
       the write-back gather advances by 64 bytes per iteration (hence the -16
       element bias; sizeof(float16_t *) is only used as "4 bytes per lane"). */
    static const int32_t strides[4] =
       { ( 0 - 16) * (int32_t)sizeof(float16_t *)
       , ( 4 - 16) * (int32_t)sizeof(float16_t *)
       , ( 8 - 16) * (int32_t)sizeof(float16_t *)
       , (12 - 16) * (int32_t)sizeof(float16_t *)};

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* all radix-4 stages except the final one; k tracks remaining butterfly span */
    for (int k = fftLen / 4u; k > 1; k >>= 2)
    {
        /* per-stage twiddle pointers into the rearranged tables */
        float16_t const     *p_rearranged_twiddle_tab_stride1 =
                            &S->rearranged_twiddle_stride1[
                            S->rearranged_twiddle_tab_stride1_arr[stage]];
        float16_t const     *p_rearranged_twiddle_tab_stride2 =
                            &S->rearranged_twiddle_stride2[
                            S->rearranged_twiddle_tab_stride2_arr[stage]];
        float16_t const     *p_rearranged_twiddle_tab_stride3 =
                            &S->rearranged_twiddle_stride3[
                            S->rearranged_twiddle_tab_stride3_arr[stage]];
        float16_t * pBase = pSrc;
        /* one pass per butterfly group; groups quadruple each stage */
        for (int i = 0; i < iter; i++)
        {
            /* the four legs of the radix-4 butterfly, n2 complex samples apart */
            float16_t    *inA = pBase;
            float16_t    *inB = inA + n2 * CMPLX_DIM;
            float16_t    *inC = inB + n2 * CMPLX_DIM;
            float16_t    *inD = inC + n2 * CMPLX_DIM;
            float16_t const *pW1 = p_rearranged_twiddle_tab_stride1;
            float16_t const *pW2 = p_rearranged_twiddle_tab_stride2;
            float16_t const *pW3 = p_rearranged_twiddle_tab_stride3;
            f16x8_t       vecW;

            blkCnt = n2 / 4;
            /*
             * load 4 f16 complex pairs (software-pipelined: A/C for the next
             * iteration are reloaded at the bottom of the loop)
             */
            vecA = vldrhq_f16(inA);
            vecC = vldrhq_f16(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrhq_f16(inB);
                vecD = vldrhq_f16(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 8;

                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 8;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
                 */
                vecW = vld1q(pW1);
                pW1 +=8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 8;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 8;

                /* pre-load for next iteration */
                vecA = vldrhq_f16(inA);
                vecC = vldrhq_f16(inC);

                blkCnt--;
            }
            pBase +=  CMPLX_DIM * n1;
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * start of Last stage process: length-4 butterflies with unit twiddles,
     * addressed via vector gather/scatter using the stride table above
     */
    uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /* load scheduling */
    vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);

    blkCnt = (fftLen >> 4);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4);
        vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration (also advances the gather base by 64 bytes) */
        vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);

        /* scatter the four butterfly outputs back, relative to the advanced base */
        vecTmp0 = vecSum0 + vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
252 
/**
  @brief         Forward CFFT for lengths 2 * 4^n (Helium/MVE): one radix-2 split
                 followed by two forward radix-4 sub-transforms, all in place.
  @param[in]     S       points to the CFFT instance (twiddle tables)
  @param[in,out] pSrc    interleaved complex float16 data (2*fftLen values), overwritten
  @param[in]     fftLen  number of complex samples
 */
static void arm_cfft_radix4by2_f16_mve(const arm_cfft_instance_f16 * S, float16_t *pSrc, uint32_t fftLen)
{
    float16_t const *pCoefVec;
    float16_t const  *pCoef = S->pTwiddle;
    float16_t        *pIn0, *pIn1;
    uint32_t          n2;
    uint32_t          blkCnt;
    f16x8_t         vecIn0, vecIn1, vecSum, vecDiff;
    f16x8_t         vecCmplxTmp, vecTw;


    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;
    pCoefVec = pCoef;

    /* radix-2 stage: butterfly between the two halves, 4 complex pairs per vector */
    blkCnt = n2 / 4;
    while (blkCnt > 0U)
    {
        /* load via intrinsic rather than dereferencing a cast pointer:
           avoids a strict-aliasing violation and matches the vst1q stores */
        vecIn0 = vldrhq_f16(pIn0);
        vecIn1 = vldrhq_f16(pIn1);
        vecTw = vld1q(pCoefVec);
        pCoefVec += 8;

        vecSum = vaddq(vecIn0, vecIn1);
        vecDiff = vsubq(vecIn0, vecIn1);

        /* lower half gets (in0 - in1) multiplied by the twiddle */
        vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);

        vst1q(pIn0, vecSum);
        pIn0 += 8;
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 8;

        blkCnt--;
    }

    /* two half-length radix-4 transforms, one per half */
    _arm_radix4_butterfly_f16_mve(S, pSrc, n2);

    _arm_radix4_butterfly_f16_mve(S, pSrc + fftLen, n2);

    /* (dead store "pIn0 = pSrc;" removed) */
}
296 
/**
  @brief         Inverse radix-4 decimation-in-frequency CFFT core (Helium/MVE), in place.
  @param[in]     S             points to the CFFT instance; supplies the rearranged
                               twiddle tables and their per-stage start offsets
  @param[in,out] pSrc          interleaved complex float16 data (2*fftLen values), overwritten
  @param[in]     fftLen        number of complex samples processed by this call
  @param[in]     onebyfftLen   1/fftLen scale factor applied in the last stage

  Mirrors _arm_radix4_butterfly_f16_mve but uses the non-conjugated complex
  multiply and swaps the +i/-i legs, and folds the 1/N scaling into the final
  stage stores.
 */
static void _arm_radix4_butterfly_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc, uint32_t fftLen, float16_t onebyfftLen)
{
    f16x8_t vecTmp0, vecTmp1;
    f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f16x8_t vecA, vecB, vecC, vecD;
    uint32_t  blkCnt;
    uint32_t  n1, n2;
    uint32_t  stage = 0;
    int32_t  iter = 1;
    /* byte offsets for the gather/scatter last stage; sizeof(q31_t *) is just
       "4 bytes per lane" (same as sizeof(float16_t *) on these targets) */
    static const int32_t strides[4] = {
        ( 0 - 16) * (int32_t)sizeof(q31_t *),
        ( 4 - 16) * (int32_t)sizeof(q31_t *),
        ( 8 - 16) * (int32_t)sizeof(q31_t *),
        (12 - 16) * (int32_t)sizeof(q31_t *)
    };

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* all radix-4 stages except the final one */
    for (int k = fftLen / 4; k > 1; k >>= 2)
    {
        /* per-stage twiddle pointers into the rearranged tables */
        float16_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                S->rearranged_twiddle_tab_stride1_arr[stage]];
        float16_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                S->rearranged_twiddle_tab_stride2_arr[stage]];
        float16_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                S->rearranged_twiddle_tab_stride3_arr[stage]];

        float16_t * pBase = pSrc;
        for (int i = 0; i < iter; i++)
        {
            /* the four legs of the radix-4 butterfly, n2 complex samples apart */
            float16_t    *inA = pBase;
            float16_t    *inB = inA + n2 * CMPLX_DIM;
            float16_t    *inC = inB + n2 * CMPLX_DIM;
            float16_t    *inD = inC + n2 * CMPLX_DIM;
            float16_t const *pW1 = p_rearranged_twiddle_tab_stride1;
            float16_t const *pW2 = p_rearranged_twiddle_tab_stride2;
            float16_t const *pW3 = p_rearranged_twiddle_tab_stride3;
            f16x8_t       vecW;

            blkCnt = n2 / 4;
            /*
             * load 4 f16 complex pairs (software-pipelined: A/C for the next
             * iteration are reloaded at the bottom of the loop)
             */
            vecA = vldrhq_f16(inA);
            vecC = vldrhq_f16(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrhq_f16(inB);
                vecD = vldrhq_f16(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 8;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 8;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'  (sign of i swapped vs. forward)
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 8;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 8;

                /* pre-load for next iteration */
                vecA = vldrhq_f16(inA);
                vecC = vldrhq_f16(inC);

                blkCnt--;
            }
            pBase +=  CMPLX_DIM * n1;
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * start of Last stage process: length-4 butterflies with unit twiddles,
     * addressed via vector gather/scatter, with 1/N scaling folded in
     */
    uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);

    blkCnt = (fftLen >> 4);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4);
        vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration (also advances the gather base by 64 bytes) */
        vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);

        /* scatter the four scaled outputs back, relative to the advanced base */
        vecTmp0 = vecSum0 + vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
464 
/**
  @brief         Inverse CFFT for lengths 2 * 4^n (Helium/MVE): one radix-2 split
                 followed by two inverse radix-4 sub-transforms, all in place.
                 The 1/fftLen scaling is applied inside the sub-transforms.
  @param[in]     S       points to the CFFT instance (twiddle tables)
  @param[in,out] pSrc    interleaved complex float16 data (2*fftLen values), overwritten
  @param[in]     fftLen  number of complex samples
 */
static void arm_cfft_radix4by2_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t *pSrc, uint32_t fftLen)
{
    float16_t const *pCoefVec;
    float16_t const  *pCoef = S->pTwiddle;
    float16_t        *pIn0, *pIn1;
    uint32_t          n2;
    float16_t         onebyfftLen = arm_inverse_fft_length_f16(fftLen);
    uint32_t          blkCnt;
    f16x8_t         vecIn0, vecIn1, vecSum, vecDiff;
    f16x8_t         vecCmplxTmp, vecTw;


    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;
    pCoefVec = pCoef;

    /* radix-2 stage: butterfly between the two halves, 4 complex pairs per vector */
    blkCnt = n2 / 4;
    while (blkCnt > 0U)
    {
        /* load via intrinsic rather than dereferencing a cast pointer:
           avoids a strict-aliasing violation and matches the vst1q stores */
        vecIn0 = vldrhq_f16(pIn0);
        vecIn1 = vldrhq_f16(pIn1);
        vecTw = vld1q(pCoefVec);
        pCoefVec += 8;

        vecSum = vaddq(vecIn0, vecIn1);
        vecDiff = vsubq(vecIn0, vecIn1);

        /* inverse direction uses the non-conjugated complex multiply */
        vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff);

        vst1q(pIn0, vecSum);
        pIn0 += 8;
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 8;

        blkCnt--;
    }

    /* two half-length inverse radix-4 transforms, one per half */
    _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, n2, onebyfftLen);

    _arm_radix4_butterfly_inverse_f16_mve(S, pSrc + fftLen, n2, onebyfftLen);
}
507 
508 
509 /**
510   @addtogroup ComplexFFTF16
511   @{
512  */
513 
514 /**
515   @brief         Processing function for the floating-point complex FFT.
516   @param[in]     S              points to an instance of the floating-point CFFT structure
517   @param[in,out] p1             points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
518   @param[in]     ifftFlag       flag that selects transform direction
519                    - value = 0: forward transform
520                    - value = 1: inverse transform
521   @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
522                    - value = 0: disables bit reversal of output
523                    - value = 1: enables bit reversal of output
524  */
525 
526 
arm_cfft_f16(const arm_cfft_instance_f16 * S,float16_t * pSrc,uint8_t ifftFlag,uint8_t bitReverseFlag)527 ARM_DSP_ATTRIBUTE void arm_cfft_f16(
528   const arm_cfft_instance_f16 * S,
529         float16_t * pSrc,
530         uint8_t ifftFlag,
531         uint8_t bitReverseFlag)
532 {
533         uint32_t fftLen = S->fftLen;
534 
535         if (ifftFlag == 1U) {
536 
537             switch (fftLen) {
538             case 16:
539             case 64:
540             case 256:
541             case 1024:
542             case 4096:
543                 _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen));
544                 break;
545 
546             case 32:
547             case 128:
548             case 512:
549             case 2048:
550                 arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);
551                 break;
552             }
553         } else {
554             switch (fftLen) {
555             case 16:
556             case 64:
557             case 256:
558             case 1024:
559             case 4096:
560                 _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);
561                 break;
562 
563             case 32:
564             case 128:
565             case 512:
566             case 2048:
567                 arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);
568                 break;
569             }
570         }
571 
572 
573         if (bitReverseFlag)
574         {
575 
576             arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
577 
578         }
579 }
580 
581 #else
582 
583 #if defined(ARM_FLOAT16_SUPPORTED)
584 
585 extern void arm_bitreversal_16(
586         uint16_t * pSrc,
587   const uint16_t bitRevLen,
588   const uint16_t * pBitRevTable);
589 
590 
591 extern void arm_cfft_radix4by2_f16(
592     float16_t * pSrc,
593     uint32_t fftLen,
594     const float16_t * pCoef);
595 
596 extern void arm_radix4_butterfly_f16(
597         float16_t * pSrc,
598         uint16_t fftLen,
599   const float16_t * pCoef,
600         uint16_t twidCoefModifier);
601 
602 /**
603   @addtogroup ComplexFFTF16
604   @{
605  */
606 
607 /**
608   @brief         Processing function for the floating-point complex FFT.
609   @param[in]     S              points to an instance of the floating-point CFFT structure
610   @param[in,out] p1             points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
611   @param[in]     ifftFlag       flag that selects transform direction
612                    - value = 0: forward transform
613                    - value = 1: inverse transform
614   @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
615                    - value = 0: disables bit reversal of output
616                    - value = 1: enables bit reversal of output
617  */
618 
arm_cfft_f16(const arm_cfft_instance_f16 * S,float16_t * p1,uint8_t ifftFlag,uint8_t bitReverseFlag)619 ARM_DSP_ATTRIBUTE void arm_cfft_f16(
620     const arm_cfft_instance_f16 * S,
621     float16_t * p1,
622     uint8_t ifftFlag,
623     uint8_t bitReverseFlag)
624 {
625     uint32_t  L = S->fftLen, l;
626     float16_t invL, * pSrc;
627 
628     if (ifftFlag == 1U)
629     {
630         /*  Conjugate input data  */
631         pSrc = p1 + 1;
632         for(l=0; l<L; l++)
633         {
634             *pSrc = -(_Float16)*pSrc;
635             pSrc += 2;
636         }
637     }
638 
639     switch (L)
640     {
641 
642         case 16:
643         case 64:
644         case 256:
645         case 1024:
646         case 4096:
647         arm_radix4_butterfly_f16  (p1, L, (float16_t*)S->pTwiddle, 1U);
648         break;
649 
650         case 32:
651         case 128:
652         case 512:
653         case 2048:
654         arm_cfft_radix4by2_f16  ( p1, L, (float16_t*)S->pTwiddle);
655         break;
656 
657     }
658 
659     if ( bitReverseFlag )
660         arm_bitreversal_16((uint16_t*)p1, S->bitRevLength,(uint16_t*)S->pBitRevTable);
661 
662     if (ifftFlag == 1U)
663     {
664         invL = 1.0f16/(_Float16)L;
665         /*  Conjugate and scale output data */
666         pSrc = p1;
667         for(l=0; l<L; l++)
668         {
669             *pSrc++ *=   (_Float16)invL ;
670             *pSrc  = -(_Float16)(*pSrc) * (_Float16)invL;
671             pSrc++;
672         }
673     }
674 }
675 #endif /* if defined(ARM_FLOAT16_SUPPORTED) */
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
677 
678 /**
679   @} end of ComplexFFTF16 group
680  */
681