1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
 * Title:        arm_cfft_f16.c
 * Description:  Combined Radix Decimation in Frequency CFFT Half floating point processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/transform_functions_f16.h"
30 #include "arm_common_tables_f16.h"
31 
32 
33 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
34 
35 #include "arm_helium_utils.h"
36 #include "arm_vec_fft.h"
37 #include "arm_mve_tables_f16.h"
38 
39 
arm_inverse_fft_length_f16(uint16_t fftLen)40 static float16_t arm_inverse_fft_length_f16(uint16_t fftLen)
41 {
42   float16_t retValue=1.0;
43 
44   switch (fftLen)
45   {
46 
47   case 4096U:
48     retValue = (float16_t)0.000244140625f;
49     break;
50 
51   case 2048U:
52     retValue = (float16_t)0.00048828125f;
53     break;
54 
55   case 1024U:
56     retValue = (float16_t)0.0009765625f;
57     break;
58 
59   case 512U:
60     retValue = (float16_t)0.001953125f;
61     break;
62 
63   case 256U:
64     retValue = (float16_t)0.00390625f;
65     break;
66 
67   case 128U:
68     retValue = (float16_t)0.0078125f;
69     break;
70 
71   case 64U:
72     retValue = (float16_t)0.015625f;
73     break;
74 
75   case 32U:
76     retValue = (float16_t)0.03125f;
77     break;
78 
79   case 16U:
80     retValue = (float16_t)0.0625f;
81     break;
82 
83 
84   default:
85     break;
86   }
87   return(retValue);
88 }
89 
90 
_arm_radix4_butterfly_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc,uint32_t fftLen)91 static void _arm_radix4_butterfly_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc, uint32_t fftLen)
92 {
93     f16x8_t vecTmp0, vecTmp1;
94     f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
95     f16x8_t vecA, vecB, vecC, vecD;
96     uint32_t  blkCnt;
97     uint32_t  n1, n2;
98     uint32_t  stage = 0;
99     int32_t  iter = 1;
100     static const uint32_t strides[4] =
101        {(0 - 16) * sizeof(float16_t *)
102        , (4 - 16) * sizeof(float16_t *)
103        , (8 - 16) * sizeof(float16_t *)
104        , (12 - 16) * sizeof(float16_t *)};
105 
106     n2 = fftLen;
107     n1 = n2;
108     n2 >>= 2u;
109     for (int k = fftLen / 4u; k > 1; k >>= 2)
110     {
111         for (int i = 0; i < iter; i++)
112         {
113             float16_t const     *p_rearranged_twiddle_tab_stride1 =
114                                 &S->rearranged_twiddle_stride1[
115                                 S->rearranged_twiddle_tab_stride1_arr[stage]];
116             float16_t const     *p_rearranged_twiddle_tab_stride2 =
117                                 &S->rearranged_twiddle_stride2[
118                                 S->rearranged_twiddle_tab_stride2_arr[stage]];
119             float16_t const     *p_rearranged_twiddle_tab_stride3 =
120                                 &S->rearranged_twiddle_stride3[
121                                 S->rearranged_twiddle_tab_stride3_arr[stage]];
122             float16_t const    *pW1, *pW2, *pW3;
123             float16_t           *inA = pSrc + CMPLX_DIM * i * n1;
124             float16_t           *inB = inA + n2 * CMPLX_DIM;
125             float16_t           *inC = inB + n2 * CMPLX_DIM;
126             float16_t           *inD = inC + n2 * CMPLX_DIM;
127             f16x8_t            vecW;
128 
129 
130             pW1 = p_rearranged_twiddle_tab_stride1;
131             pW2 = p_rearranged_twiddle_tab_stride2;
132             pW3 = p_rearranged_twiddle_tab_stride3;
133 
134             blkCnt = n2 / 4;
135             /*
136              * load 2 f16 complex pair
137              */
138             vecA = vldrhq_f16(inA);
139             vecC = vldrhq_f16(inC);
140             while (blkCnt > 0U)
141             {
142                 vecB = vldrhq_f16(inB);
143                 vecD = vldrhq_f16(inD);
144 
145                 vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
146                 vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
147 
148                 vecSum1 = vecB + vecD;
149                 vecDiff1 = vecB - vecD;
150                 /*
151                  * [ 1 1 1 1 ] * [ A B C D ]' .* 1
152                  */
153                 vecTmp0 = vecSum0 + vecSum1;
154                 vst1q(inA, vecTmp0);
155                 inA += 8;
156 
157                 /*
158                  * [ 1 -1 1 -1 ] * [ A B C D ]'
159                  */
160                 vecTmp0 = vecSum0 - vecSum1;
161                 /*
162                  * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
163                  */
164                 vecW = vld1q(pW2);
165                 pW2 += 8;
166                 vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
167                 vst1q(inB, vecTmp1);
168                 inB += 8;
169 
170                 /*
171                  * [ 1 -i -1 +i ] * [ A B C D ]'
172                  */
173                 vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
174                 /*
175                  * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
176                  */
177                 vecW = vld1q(pW1);
178                 pW1 +=8;
179                 vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
180                 vst1q(inC, vecTmp1);
181                 inC += 8;
182 
183                 /*
184                  * [ 1 +i -1 -i ] * [ A B C D ]'
185                  */
186                 vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
187                 /*
188                  * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
189                  */
190                 vecW = vld1q(pW3);
191                 pW3 += 8;
192                 vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
193                 vst1q(inD, vecTmp1);
194                 inD += 8;
195 
196                 vecA = vldrhq_f16(inA);
197                 vecC = vldrhq_f16(inC);
198 
199                 blkCnt--;
200             }
201         }
202         n1 = n2;
203         n2 >>= 2u;
204         iter = iter << 2;
205         stage++;
206     }
207 
208     /*
209      * start of Last stage process
210      */
211     uint32x4_t vecScGathAddr = vld1q_u32(strides);
212     vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
213 
214     /* load scheduling */
215     vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
216     vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);
217 
218     blkCnt = (fftLen >> 4);
219     while (blkCnt > 0U)
220     {
221         vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
222         vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
223 
224         vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4);
225         vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12);
226 
227         vecSum1 = vecB + vecD;
228         vecDiff1 = vecB - vecD;
229 
230         /* pre-load for next iteration */
231         vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
232         vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);
233 
234         vecTmp0 = vecSum0 + vecSum1;
235         vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0);
236 
237         vecTmp0 = vecSum0 - vecSum1;
238         vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0);
239 
240         vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
241         vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0);
242 
243         vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
244         vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0);
245 
246         blkCnt--;
247     }
248 
249     /*
250      * End of last stage process
251      */
252 }
253 
arm_cfft_radix4by2_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc,uint32_t fftLen)254 static void arm_cfft_radix4by2_f16_mve(const arm_cfft_instance_f16 * S, float16_t *pSrc, uint32_t fftLen)
255 {
256     float16_t const *pCoefVec;
257     float16_t const  *pCoef = S->pTwiddle;
258     float16_t        *pIn0, *pIn1;
259     uint32_t          n2;
260     uint32_t          blkCnt;
261     f16x8_t         vecIn0, vecIn1, vecSum, vecDiff;
262     f16x8_t         vecCmplxTmp, vecTw;
263 
264 
265     n2 = fftLen >> 1;
266     pIn0 = pSrc;
267     pIn1 = pSrc + fftLen;
268     pCoefVec = pCoef;
269 
270     blkCnt = n2 / 4;
271     while (blkCnt > 0U)
272     {
273         vecIn0 = *(f16x8_t *) pIn0;
274         vecIn1 = *(f16x8_t *) pIn1;
275         vecTw = vld1q(pCoefVec);
276         pCoefVec += 8;
277 
278         vecSum = vaddq(vecIn0, vecIn1);
279         vecDiff = vsubq(vecIn0, vecIn1);
280 
281         vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);
282 
283         vst1q(pIn0, vecSum);
284         pIn0 += 8;
285         vst1q(pIn1, vecCmplxTmp);
286         pIn1 += 8;
287 
288         blkCnt--;
289     }
290 
291     _arm_radix4_butterfly_f16_mve(S, pSrc, n2);
292 
293     _arm_radix4_butterfly_f16_mve(S, pSrc + fftLen, n2);
294 
295     pIn0 = pSrc;
296 }
297 
_arm_radix4_butterfly_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc,uint32_t fftLen,float16_t onebyfftLen)298 static void _arm_radix4_butterfly_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc, uint32_t fftLen, float16_t onebyfftLen)
299 {
300     f16x8_t vecTmp0, vecTmp1;
301     f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
302     f16x8_t vecA, vecB, vecC, vecD;
303     f16x8_t vecW;
304     uint32_t  blkCnt;
305     uint32_t  n1, n2;
306     uint32_t  stage = 0;
307     int32_t  iter = 1;
308     static const uint32_t strides[4] = {
309         (0 - 16) * sizeof(q31_t *),
310         (4 - 16) * sizeof(q31_t *),
311         (8 - 16) * sizeof(q31_t *),
312         (12 - 16) * sizeof(q31_t *)
313     };
314 
315     n2 = fftLen;
316     n1 = n2;
317     n2 >>= 2u;
318     for (int k = fftLen / 4; k > 1; k >>= 2)
319     {
320         for (int i = 0; i < iter; i++)
321         {
322             float16_t const *p_rearranged_twiddle_tab_stride1 =
323                     &S->rearranged_twiddle_stride1[
324                     S->rearranged_twiddle_tab_stride1_arr[stage]];
325             float16_t const *p_rearranged_twiddle_tab_stride2 =
326                     &S->rearranged_twiddle_stride2[
327                     S->rearranged_twiddle_tab_stride2_arr[stage]];
328             float16_t const *p_rearranged_twiddle_tab_stride3 =
329                     &S->rearranged_twiddle_stride3[
330                     S->rearranged_twiddle_tab_stride3_arr[stage]];
331             float16_t const *pW1, *pW2, *pW3;
332             float16_t *inA = pSrc + CMPLX_DIM * i * n1;
333             float16_t *inB = inA + n2 * CMPLX_DIM;
334             float16_t *inC = inB + n2 * CMPLX_DIM;
335             float16_t *inD = inC + n2 * CMPLX_DIM;
336 
337             pW1 = p_rearranged_twiddle_tab_stride1;
338             pW2 = p_rearranged_twiddle_tab_stride2;
339             pW3 = p_rearranged_twiddle_tab_stride3;
340 
341             blkCnt = n2 / 4;
342             /*
343              * load 2 f32 complex pair
344              */
345             vecA = vldrhq_f16(inA);
346             vecC = vldrhq_f16(inC);
347             while (blkCnt > 0U)
348             {
349                 vecB = vldrhq_f16(inB);
350                 vecD = vldrhq_f16(inD);
351 
352                 vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
353                 vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
354 
355                 vecSum1 = vecB + vecD;
356                 vecDiff1 = vecB - vecD;
357                 /*
358                  * [ 1 1 1 1 ] * [ A B C D ]' .* 1
359                  */
360                 vecTmp0 = vecSum0 + vecSum1;
361                 vst1q(inA, vecTmp0);
362                 inA += 8;
363                 /*
364                  * [ 1 -1 1 -1 ] * [ A B C D ]'
365                  */
366                 vecTmp0 = vecSum0 - vecSum1;
367                 /*
368                  * [ 1 -1 1 -1 ] * [ A B C D ]'.* W1
369                  */
370                 vecW = vld1q(pW2);
371                 pW2 += 8;
372                 vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
373                 vst1q(inB, vecTmp1);
374                 inB += 8;
375 
376                 /*
377                  * [ 1 -i -1 +i ] * [ A B C D ]'
378                  */
379                 vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
380                 /*
381                  * [ 1 -i -1 +i ] * [ A B C D ]'.* W2
382                  */
383                 vecW = vld1q(pW1);
384                 pW1 += 8;
385                 vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
386                 vst1q(inC, vecTmp1);
387                 inC += 8;
388 
389                 /*
390                  * [ 1 +i -1 -i ] * [ A B C D ]'
391                  */
392                 vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
393                 /*
394                  * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
395                  */
396                 vecW = vld1q(pW3);
397                 pW3 += 8;
398                 vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
399                 vst1q(inD, vecTmp1);
400                 inD += 8;
401 
402                 vecA = vldrhq_f16(inA);
403                 vecC = vldrhq_f16(inC);
404 
405                 blkCnt--;
406             }
407         }
408         n1 = n2;
409         n2 >>= 2u;
410         iter = iter << 2;
411         stage++;
412     }
413 
414     /*
415      * start of Last stage process
416      */
417     uint32x4_t vecScGathAddr = vld1q_u32(strides);
418     vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
419 
420     /*
421      * load scheduling
422      */
423     vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
424     vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);
425 
426     blkCnt = (fftLen >> 4);
427     while (blkCnt > 0U)
428     {
429         vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
430         vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
431 
432         vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4);
433         vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12);
434 
435         vecSum1 = vecB + vecD;
436         vecDiff1 = vecB - vecD;
437 
438         vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
439         vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);
440 
441         vecTmp0 = vecSum0 + vecSum1;
442         vecTmp0 = vecTmp0 * onebyfftLen;
443         vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0);
444 
445         vecTmp0 = vecSum0 - vecSum1;
446         vecTmp0 = vecTmp0 * onebyfftLen;
447         vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0);
448 
449         vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
450         vecTmp0 = vecTmp0 * onebyfftLen;
451         vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0);
452 
453         vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
454         vecTmp0 = vecTmp0 * onebyfftLen;
455         vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0);
456 
457         blkCnt--;
458     }
459 
460     /*
461      * End of last stage process
462      */
463 }
464 
arm_cfft_radix4by2_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc,uint32_t fftLen)465 static void arm_cfft_radix4by2_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t *pSrc, uint32_t fftLen)
466 {
467     float16_t const *pCoefVec;
468     float16_t const  *pCoef = S->pTwiddle;
469     float16_t        *pIn0, *pIn1;
470     uint32_t          n2;
471     float16_t         onebyfftLen = arm_inverse_fft_length_f16(fftLen);
472     uint32_t          blkCnt;
473     f16x8_t         vecIn0, vecIn1, vecSum, vecDiff;
474     f16x8_t         vecCmplxTmp, vecTw;
475 
476 
477     n2 = fftLen >> 1;
478     pIn0 = pSrc;
479     pIn1 = pSrc + fftLen;
480     pCoefVec = pCoef;
481 
482     blkCnt = n2 / 4;
483     while (blkCnt > 0U)
484     {
485         vecIn0 = *(f16x8_t *) pIn0;
486         vecIn1 = *(f16x8_t *) pIn1;
487         vecTw = vld1q(pCoefVec);
488         pCoefVec += 8;
489 
490         vecSum = vaddq(vecIn0, vecIn1);
491         vecDiff = vsubq(vecIn0, vecIn1);
492 
493         vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff);
494 
495         vst1q(pIn0, vecSum);
496         pIn0 += 8;
497         vst1q(pIn1, vecCmplxTmp);
498         pIn1 += 8;
499 
500         blkCnt--;
501     }
502 
503     _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, n2, onebyfftLen);
504 
505     _arm_radix4_butterfly_inverse_f16_mve(S, pSrc + fftLen, n2, onebyfftLen);
506 }
507 
508 
509 /**
510   @addtogroup ComplexFFT
511   @{
512  */
513 
514 /**
515   @brief         Processing function for the floating-point complex FFT.
516   @param[in]     S              points to an instance of the floating-point CFFT structure
  @param[in,out] pSrc           points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
518   @param[in]     ifftFlag       flag that selects transform direction
519                    - value = 0: forward transform
520                    - value = 1: inverse transform
521   @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
522                    - value = 0: disables bit reversal of output
523                    - value = 1: enables bit reversal of output
524   @return        none
525  */
526 
527 
arm_cfft_f16(const arm_cfft_instance_f16 * S,float16_t * pSrc,uint8_t ifftFlag,uint8_t bitReverseFlag)528 void arm_cfft_f16(
529   const arm_cfft_instance_f16 * S,
530         float16_t * pSrc,
531         uint8_t ifftFlag,
532         uint8_t bitReverseFlag)
533 {
534         uint32_t fftLen = S->fftLen;
535 
536         if (ifftFlag == 1U) {
537 
538             switch (fftLen) {
539             case 16:
540             case 64:
541             case 256:
542             case 1024:
543             case 4096:
544                 _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen));
545                 break;
546 
547             case 32:
548             case 128:
549             case 512:
550             case 2048:
551                 arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);
552                 break;
553             }
554         } else {
555             switch (fftLen) {
556             case 16:
557             case 64:
558             case 256:
559             case 1024:
560             case 4096:
561                 _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);
562                 break;
563 
564             case 32:
565             case 128:
566             case 512:
567             case 2048:
568                 arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);
569                 break;
570             }
571         }
572 
573 
574         if (bitReverseFlag)
575         {
576 
577             arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
578 
579         }
580 }
581 
582 #else
583 
584 #if defined(ARM_FLOAT16_SUPPORTED)
585 
586 extern void arm_bitreversal_16(
587         uint16_t * pSrc,
588   const uint16_t bitRevLen,
589   const uint16_t * pBitRevTable);
590 
591 
592 extern void arm_cfft_radix4by2_f16(
593     float16_t * pSrc,
594     uint32_t fftLen,
595     const float16_t * pCoef);
596 
597 extern void arm_radix4_butterfly_f16(
598         float16_t * pSrc,
599         uint16_t fftLen,
600   const float16_t * pCoef,
601         uint16_t twidCoefModifier);
602 
603 /**
604   @ingroup groupTransforms
605  */
606 
607 /**
608   @defgroup ComplexFFT Complex FFT Functions
609 
610   @par
611                    The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
612                    Discrete Fourier Transform (DFT).  The FFT can be orders of magnitude faster
613                    than the DFT, especially for long lengths.
614                    The algorithms described in this section
615                    operate on complex data.  A separate set of functions is devoted to handling
616                    of real sequences.
617   @par
618                    There are separate algorithms for handling floating-point, Q15, and Q31 data
619                    types.  The algorithms available for each data type are described next.
620   @par
621                    The FFT functions operate in-place.  That is, the array holding the input data
622                    will also be used to hold the corresponding result.  The input data is complex
623                    and contains <code>2*fftLen</code> interleaved values as shown below.
624                    <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
625                    The FFT result will be contained in the same array and the frequency domain
626                    values will have the same interleaving.
627 
628   @par Floating-point
629                    The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-8
630                    stages are performed along with a single radix-2 or radix-4 stage, as needed.
631                    The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
632                    a different twiddle factor table.
633   @par
634                    The function uses the standard FFT definition and output values may grow by a
635                    factor of <code>fftLen</code> when computing the forward transform.  The
636                    inverse transform includes a scale of <code>1/fftLen</code> as part of the
637                    calculation and this matches the textbook definition of the inverse FFT.
638   @par
639                    For the MVE version, the new arm_cfft_init_f32 initialization function is
640                    <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
641                    needed FFTs.</b> Other FFT versions can continue to be initialized as
642                    explained below.
643   @par
644                    For not MVE versions, pre-initialized data structures containing twiddle factors
645                    and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>.  Include
646                    this header in your function and then pass one of the constant structures as
647                    an argument to arm_cfft_f32.  For example:
648   @par
649                    <code>arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
650   @par
651                    computes a 64-point inverse complex FFT including bit reversal.
652                    The data structures are treated as constant data and not modified during the
653                    calculation.  The same data structure can be reused for multiple transforms
654                    including mixing forward and inverse transforms.
655   @par
656                    Earlier releases of the library provided separate radix-2 and radix-4
657                    algorithms that operated on floating-point data.  These functions are still
658                    provided but are deprecated.  The older functions are slower and less general
659                    than the new functions.
660   @par
661                    An example of initialization of the constants for the arm_cfft_f32 function follows:
662   @code
663                    const static arm_cfft_instance_f32 *S;
664                    ...
665                      switch (length) {
666                        case 16:
667                          S = &arm_cfft_sR_f32_len16;
668                          break;
669                        case 32:
670                          S = &arm_cfft_sR_f32_len32;
671                          break;
672                        case 64:
673                          S = &arm_cfft_sR_f32_len64;
674                          break;
675                        case 128:
676                          S = &arm_cfft_sR_f32_len128;
677                          break;
678                        case 256:
679                          S = &arm_cfft_sR_f32_len256;
680                          break;
681                        case 512:
682                          S = &arm_cfft_sR_f32_len512;
683                          break;
684                        case 1024:
685                          S = &arm_cfft_sR_f32_len1024;
686                          break;
687                        case 2048:
688                          S = &arm_cfft_sR_f32_len2048;
689                          break;
690                        case 4096:
691                          S = &arm_cfft_sR_f32_len4096;
692                          break;
693                      }
694   @endcode
695   @par
696                    The new arm_cfft_init_f32 can also be used.
697   @par Q15 and Q31
698                    The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-4
699                    stages are performed along with a single radix-2 stage, as needed.
700                    The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
701                    a different twiddle factor table.
702   @par
703                    The function uses the standard FFT definition and output values may grow by a
704                    factor of <code>fftLen</code> when computing the forward transform.  The
705                    inverse transform includes a scale of <code>1/fftLen</code> as part of the
706                    calculation and this matches the textbook definition of the inverse FFT.
707   @par
708                    Pre-initialized data structures containing twiddle factors and bit reversal
709                    tables are provided and defined in <code>arm_const_structs.h</code>.  Include
710                    this header in your function and then pass one of the constant structures as
711                    an argument to arm_cfft_q31. For example:
712   @par
713                    <code>arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
714   @par
715                    computes a 64-point inverse complex FFT including bit reversal.
716                    The data structures are treated as constant data and not modified during the
717                    calculation.  The same data structure can be reused for multiple transforms
718                    including mixing forward and inverse transforms.
719   @par
720                    Earlier releases of the library provided separate radix-2 and radix-4
721                    algorithms that operated on floating-point data.  These functions are still
722                    provided but are deprecated.  The older functions are slower and less general
723                    than the new functions.
724   @par
725                    An example of initialization of the constants for the arm_cfft_q31 function follows:
726   @code
727                    const static arm_cfft_instance_q31 *S;
728                    ...
729                      switch (length) {
730                        case 16:
731                          S = &arm_cfft_sR_q31_len16;
732                          break;
733                        case 32:
734                          S = &arm_cfft_sR_q31_len32;
735                          break;
736                        case 64:
737                          S = &arm_cfft_sR_q31_len64;
738                          break;
739                        case 128:
740                          S = &arm_cfft_sR_q31_len128;
741                          break;
742                        case 256:
743                          S = &arm_cfft_sR_q31_len256;
744                          break;
745                        case 512:
746                          S = &arm_cfft_sR_q31_len512;
747                          break;
748                        case 1024:
749                          S = &arm_cfft_sR_q31_len1024;
750                          break;
751                        case 2048:
752                          S = &arm_cfft_sR_q31_len2048;
753                          break;
754                        case 4096:
755                          S = &arm_cfft_sR_q31_len4096;
756                          break;
757                      }
758   @endcode
759 
760  */
761 
762 
763 /**
764   @addtogroup ComplexFFT
765   @{
766  */
767 
768 /**
769   @brief         Processing function for the floating-point complex FFT.
770   @param[in]     S              points to an instance of the floating-point CFFT structure
771   @param[in,out] p1             points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
772   @param[in]     ifftFlag       flag that selects transform direction
773                    - value = 0: forward transform
774                    - value = 1: inverse transform
775   @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
776                    - value = 0: disables bit reversal of output
777                    - value = 1: enables bit reversal of output
778   @return        none
779  */
780 
arm_cfft_f16(const arm_cfft_instance_f16 * S,float16_t * p1,uint8_t ifftFlag,uint8_t bitReverseFlag)781 void arm_cfft_f16(
782     const arm_cfft_instance_f16 * S,
783     float16_t * p1,
784     uint8_t ifftFlag,
785     uint8_t bitReverseFlag)
786 {
787     uint32_t  L = S->fftLen, l;
788     float16_t invL, * pSrc;
789 
790     if (ifftFlag == 1U)
791     {
792         /*  Conjugate input data  */
793         pSrc = p1 + 1;
794         for(l=0; l<L; l++)
795         {
796             *pSrc = -*pSrc;
797             pSrc += 2;
798         }
799     }
800 
801     switch (L)
802     {
803 
804         case 16:
805         case 64:
806         case 256:
807         case 1024:
808         case 4096:
809         arm_radix4_butterfly_f16  (p1, L, (float16_t*)S->pTwiddle, 1U);
810         break;
811 
812         case 32:
813         case 128:
814         case 512:
815         case 2048:
816         arm_cfft_radix4by2_f16  ( p1, L, (float16_t*)S->pTwiddle);
817         break;
818 
819     }
820 
821     if ( bitReverseFlag )
822         arm_bitreversal_16((uint16_t*)p1, S->bitRevLength,(uint16_t*)S->pBitRevTable);
823 
824     if (ifftFlag == 1U)
825     {
826         invL = 1.0f/(float16_t)L;
827         /*  Conjugate and scale output data */
828         pSrc = p1;
829         for(l=0; l<L; l++)
830         {
831             *pSrc++ *=   invL ;
832             *pSrc  = -(*pSrc) * invL;
833             pSrc++;
834         }
835     }
836 }
837 #endif /* if defined(ARM_FLOAT16_SUPPORTED) */
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
839 
840 /**
841   @} end of ComplexFFT group
842  */
843