1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cfft_f32.c
4  * Description:  Combined Radix Decimation in Frequency CFFT Floating point processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/transform_functions.h"
30 #include "arm_common_tables.h"
31 
32 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
33 
34 #include "arm_helium_utils.h"
35 #include "arm_vec_fft.h"
36 #include "arm_mve_tables.h"
37 
38 
arm_inverse_fft_length_f32(uint16_t fftLen)39 static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
40 {
41   float32_t retValue=1.0;
42 
43   switch (fftLen)
44   {
45 
46   case 4096U:
47     retValue = 0.000244140625;
48     break;
49 
50   case 2048U:
51     retValue = 0.00048828125;
52     break;
53 
54   case 1024U:
55     retValue = 0.0009765625f;
56     break;
57 
58   case 512U:
59     retValue = 0.001953125;
60     break;
61 
62   case 256U:
63     retValue = 0.00390625f;
64     break;
65 
66   case 128U:
67     retValue = 0.0078125;
68     break;
69 
70   case 64U:
71     retValue = 0.015625f;
72     break;
73 
74   case 32U:
75     retValue = 0.03125;
76     break;
77 
78   case 16U:
79     retValue = 0.0625f;
80     break;
81 
82 
83   default:
84     break;
85   }
86   return(retValue);
87 }
88 
89 
90 
91 
/**
  @brief  Core radix-4 decimation-in-frequency butterfly for the forward
          CFFT, vectorized with Helium/MVE.
  @param[in]     S       CFFT instance holding the pre-rearranged twiddle
                         tables and per-stage offset arrays
  @param[in,out] pSrc    interleaved complex data ({re, im} pairs),
                         processed in-place
  @param[in]     fftLen  transform length; must be a power of 4
  @return        none

  All stages except the last use contiguous vector loads/stores with
  pre-rearranged twiddles; the last stage uses gather/scatter addressing
  so its butterflies can be done with a fixed stride pattern.
 */
static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen)
{
    f32x4_t vecTmp0, vecTmp1;
    f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f32x4_t vecA, vecB, vecC, vecD;
    uint32_t  blkCnt;
    uint32_t  n1, n2;
    uint32_t  stage = 0;  /* index into the per-stage twiddle offset arrays */
    int32_t  iter = 1;    /* number of independent sub-FFT groups in the current stage */
    /* Byte offsets for the last-stage gather/scatter, relative to a base
       address that is advanced by 64 bytes per iteration (write-back load).
       NOTE(review): assumes a 32-bit target where
       sizeof(q31_t *) == sizeof(float32_t) == 4 — confirm for the port. */
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q31_t *),
        (1 - 16) * sizeof(q31_t *),
        (8 - 16) * sizeof(q31_t *),
        (9 - 16) * sizeof(q31_t *)
    };

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* All stages except the last (k = butterflies per group still to split) */
    for (int k = fftLen / 4u; k > 1; k >>= 2)
    {
        for (int i = 0; i < iter; i++)
        {
            /* Twiddle sequences W^1, W^2, W^3 for this stage, pre-rearranged
               so they can be consumed with contiguous vector loads */
            float32_t const     *p_rearranged_twiddle_tab_stride1 =
                                &S->rearranged_twiddle_stride1[
                                S->rearranged_twiddle_tab_stride1_arr[stage]];
            float32_t const     *p_rearranged_twiddle_tab_stride2 =
                                &S->rearranged_twiddle_stride2[
                                S->rearranged_twiddle_tab_stride2_arr[stage]];
            float32_t const     *p_rearranged_twiddle_tab_stride3 =
                                &S->rearranged_twiddle_stride3[
                                S->rearranged_twiddle_tab_stride3_arr[stage]];
            float32_t const    *pW1, *pW2, *pW3;
            /* Four quarter-span legs of this group's butterflies */
            float32_t           *inA = pSrc + CMPLX_DIM * i * n1;
            float32_t           *inB = inA + n2 * CMPLX_DIM;
            float32_t           *inC = inB + n2 * CMPLX_DIM;
            float32_t           *inD = inC + n2 * CMPLX_DIM;
            f32x4_t            vecW;


            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 2;
            /*
             * load 2 f32 complex pair
             */
            vecA = vldrwq_f32(inA);
            vecC = vldrwq_f32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_f32(inB);
                vecD = vldrwq_f32(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 4;

                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                /* forward transform: conjugated complex multiply */
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 4;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
                 */
                vecW = vld1q(pW1);
                pW1 +=4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 4;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 4;

                /* pre-load for next iteration (software pipelining) */
                vecA = vldrwq_f32(inA);
                vecC = vldrwq_f32(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * start of Last stage process
     */
    /* NOTE(review): casting pSrc to uint32_t assumes 32-bit addresses —
       valid on Cortex-M/Cortex-A32 MVE targets; confirm for the port. */
    uint32x4_t vecScGathAddr = vld1q_u32(strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /* load scheduling */
    vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration */
        vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

        /* scatter offsets are negative because the base was already
           advanced by the write-back gather above */
        vecTmp0 = vecSum0 + vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
255 
/**
  @brief         Radix-4-by-2 forward CFFT step (MVE): one vectorized
                 radix-2 stage followed by two half-length radix-4 FFTs.
  @param[in]     S       CFFT instance (twiddle tables)
  @param[in,out] pSrc    interleaved complex data, processed in-place
  @param[in]     fftLen  transform length (power of 2 that is not a power of 4)
  @return        none
 */
static void arm_cfft_radix4by2_f32_mve(const arm_cfft_instance_f32 * S, float32_t *pSrc, uint32_t fftLen)
{
    float32_t const *pCoefVec;
    float32_t const  *pCoef = S->pTwiddle;
    float32_t        *pIn0, *pIn1;
    uint32_t          n2;
    uint32_t          blkCnt;
    f32x4_t         vecIn0, vecIn1, vecSum, vecDiff;
    f32x4_t         vecCmplxTmp, vecTw;


    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;   /* second half: n2 complex = fftLen floats ahead */
    pCoefVec = pCoef;

    /* radix-2 stage, 2 complex lanes (4 floats) per iteration */
    blkCnt = n2 / 2;
    while (blkCnt > 0U)
    {
        /* vld1q instead of a direct '*(f32x4_t *)' dereference: the raw
           vector dereference requires vector alignment and breaks strict
           aliasing; vld1q only needs element alignment (and matches the
           twiddle load below). */
        vecIn0 = vld1q(pIn0);
        vecIn1 = vld1q(pIn1);
        vecTw = vld1q(pCoefVec);
        pCoefVec += 4;

        vecSum = vecIn0 + vecIn1;
        vecDiff = vecIn0 - vecIn1;

        /* forward transform: conjugated complex multiply by the twiddle */
        vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);

        vst1q(pIn0, vecSum);
        pIn0 += 4;
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        blkCnt--;
    }

    /* complete with two half-length radix-4 transforms, one per half */
    _arm_radix4_butterfly_f32_mve(S, pSrc, n2);

    _arm_radix4_butterfly_f32_mve(S, pSrc + fftLen, n2);
    /* (dead trailing 'pIn0 = pSrc;' assignment removed — value never read) */
}
299 
/**
  @brief  Core radix-4 decimation-in-frequency butterfly for the inverse
          CFFT, vectorized with Helium/MVE.
  @param[in]     S            CFFT instance holding the pre-rearranged
                              twiddle tables and per-stage offset arrays
  @param[in,out] pSrc         interleaved complex data ({re, im} pairs),
                              processed in-place
  @param[in]     fftLen       transform length; must be a power of 4
  @param[in]     onebyfftLen  1/fftLen scale factor, applied during the
                              last stage so the inverse matches the
                              textbook definition
  @return        none

  Mirrors _arm_radix4_butterfly_f32_mve but uses the non-conjugated
  complex multiply (MVE_CMPLX_MULT_FLT_AxB) and swapped +/-i rotations,
  and folds the 1/fftLen scaling into the final gather/scatter stage.
 */
static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen, float32_t onebyfftLen)
{
    f32x4_t vecTmp0, vecTmp1;
    f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f32x4_t vecA, vecB, vecC, vecD;
    f32x4_t vecW;
    uint32_t  blkCnt;
    uint32_t  n1, n2;
    uint32_t  stage = 0;  /* index into the per-stage twiddle offset arrays */
    int32_t  iter = 1;    /* number of independent sub-FFT groups in the current stage */
    /* Byte offsets for the last-stage gather/scatter (base pre-advanced by
       64 bytes per iteration).  NOTE(review): assumes a 32-bit target where
       sizeof(q31_t *) == 4 — confirm for the port. */
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q31_t *),
        (1 - 16) * sizeof(q31_t *),
        (8 - 16) * sizeof(q31_t *),
        (9 - 16) * sizeof(q31_t *)
    };

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* All stages except the last */
    for (int k = fftLen / 4; k > 1; k >>= 2)
    {
        for (int i = 0; i < iter; i++)
        {
            /* Twiddle sequences W^1, W^2, W^3 for this stage, pre-rearranged
               for contiguous vector loads */
            float32_t const *p_rearranged_twiddle_tab_stride1 =
                    &S->rearranged_twiddle_stride1[
                    S->rearranged_twiddle_tab_stride1_arr[stage]];
            float32_t const *p_rearranged_twiddle_tab_stride2 =
                    &S->rearranged_twiddle_stride2[
                    S->rearranged_twiddle_tab_stride2_arr[stage]];
            float32_t const *p_rearranged_twiddle_tab_stride3 =
                    &S->rearranged_twiddle_stride3[
                    S->rearranged_twiddle_tab_stride3_arr[stage]];
            float32_t const *pW1, *pW2, *pW3;
            /* Four quarter-span legs of this group's butterflies */
            float32_t *inA = pSrc + CMPLX_DIM * i * n1;
            float32_t *inB = inA + n2 * CMPLX_DIM;
            float32_t *inC = inB + n2 * CMPLX_DIM;
            float32_t *inD = inC + n2 * CMPLX_DIM;

            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 2;
            /*
             * load 2 f32 complex pair
             */
            vecA = vldrwq_f32(inA);
            vecC = vldrwq_f32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_f32(inB);
                vecD = vldrwq_f32(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 4;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                /* inverse transform: non-conjugated complex multiply */
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 4;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 4;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 4;

                /* pre-load for next iteration (software pipelining) */
                vecA = vldrwq_f32(inA);
                vecC = vldrwq_f32(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * start of Last stage process
     */
    /* NOTE(review): casting pSrc to uint32_t assumes 32-bit addresses —
       valid on Cortex-M/Cortex-A32 MVE targets; confirm for the port. */
    uint32x4_t vecScGathAddr = vld1q_u32 (strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration */
        vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

        /* 1/fftLen scaling is applied here, once, on the final outputs;
           scatter offsets are negative because the base was already
           advanced by the write-back gather above */
        vecTmp0 = vecSum0 + vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
466 
/**
  @brief         Radix-4-by-2 inverse CFFT step (MVE): one vectorized
                 radix-2 stage followed by two half-length inverse
                 radix-4 FFTs with 1/fftLen scaling.
  @param[in]     S       CFFT instance (twiddle tables)
  @param[in,out] pSrc    interleaved complex data, processed in-place
  @param[in]     fftLen  transform length (power of 2 that is not a power of 4)
  @return        none
 */
static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t *pSrc, uint32_t fftLen)
{
    float32_t const *pCoefVec;
    float32_t const  *pCoef = S->pTwiddle;
    float32_t        *pIn0, *pIn1;
    uint32_t          n2;
    float32_t         onebyfftLen = arm_inverse_fft_length_f32(fftLen);
    uint32_t          blkCnt;
    f32x4_t         vecIn0, vecIn1, vecSum, vecDiff;
    f32x4_t         vecCmplxTmp, vecTw;


    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;   /* second half: n2 complex = fftLen floats ahead */
    pCoefVec = pCoef;

    /* radix-2 stage, 2 complex lanes (4 floats) per iteration */
    blkCnt = n2 / 2;
    while (blkCnt > 0U)
    {
        /* vld1q instead of a direct '*(f32x4_t *)' dereference: the raw
           vector dereference requires vector alignment and breaks strict
           aliasing; vld1q only needs element alignment (and matches the
           twiddle load below). */
        vecIn0 = vld1q(pIn0);
        vecIn1 = vld1q(pIn1);
        vecTw = vld1q(pCoefVec);
        pCoefVec += 4;

        vecSum = vecIn0 + vecIn1;
        vecDiff = vecIn0 - vecIn1;

        /* inverse transform: non-conjugated complex multiply */
        vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff);

        vst1q(pIn0, vecSum);
        pIn0 += 4;
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        blkCnt--;
    }

    /* complete with two half-length inverse radix-4 transforms; the
       1/fftLen scaling is applied inside their last stage */
    _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, n2, onebyfftLen);

    _arm_radix4_butterfly_inverse_f32_mve(S, pSrc + fftLen, n2, onebyfftLen);
}
509 
510 
511 /**
512   @addtogroup ComplexFFT
513   @{
514  */
515 
516 /**
517   @brief         Processing function for the floating-point complex FFT.
518   @param[in]     S              points to an instance of the floating-point CFFT structure
519   @param[in,out] p1             points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
520   @param[in]     ifftFlag       flag that selects transform direction
521                    - value = 0: forward transform
522                    - value = 1: inverse transform
523   @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
524                    - value = 0: disables bit reversal of output
525                    - value = 1: enables bit reversal of output
526   @return        none
527  */
528 
529 
void arm_cfft_f32(
  const arm_cfft_instance_f32 * S,
        float32_t * pSrc,
        uint8_t ifftFlag,
        uint8_t bitReverseFlag)
{
        uint32_t fftLen = S->fftLen;

        if (ifftFlag == 1U) {

            switch (fftLen) {
            /* power-of-4 lengths: pure radix-4 path */
            case 16:
            case 64:
            case 256:
            case 1024:
            case 4096:
                _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(fftLen));
                break;

            /* remaining power-of-2 lengths: radix-2 stage + radix-4 */
            case 32:
            case 128:
            case 512:
            case 2048:
                arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
                break;

            default:
                /* unsupported length: data left untouched */
                break;
            }
        } else {
            switch (fftLen) {
            /* power-of-4 lengths: pure radix-4 path */
            case 16:
            case 64:
            case 256:
            case 1024:
            case 4096:
                _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
                break;

            /* remaining power-of-2 lengths: radix-2 stage + radix-4 */
            case 32:
            case 128:
            case 512:
            case 2048:
                arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
                break;

            default:
                /* unsupported length: data left untouched */
                break;
            }
        }


        if (bitReverseFlag)
        {
            /* reorder the in-place result into natural order */
            arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);

        }
}
583 
584 
585 #else
586 extern void arm_radix8_butterfly_f32(
587         float32_t * pSrc,
588         uint16_t fftLen,
589   const float32_t * pCoef,
590         uint16_t twidCoefModifier);
591 
592 extern void arm_bitreversal_32(
593         uint32_t * pSrc,
594   const uint16_t bitRevLen,
595   const uint16_t * pBitRevTable);
596 
597 /**
598   @ingroup groupTransforms
599  */
600 
601 /**
602   @defgroup ComplexFFT Complex FFT Functions
603 
604   @par
605                    The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
606                    Discrete Fourier Transform (DFT).  The FFT can be orders of magnitude faster
607                    than the DFT, especially for long lengths.
608                    The algorithms described in this section
609                    operate on complex data.  A separate set of functions is devoted to handling
610                    of real sequences.
611   @par
612                    There are separate algorithms for handling floating-point, Q15, and Q31 data
613                    types.  The algorithms available for each data type are described next.
614   @par
615                    The FFT functions operate in-place.  That is, the array holding the input data
616                    will also be used to hold the corresponding result.  The input data is complex
617                    and contains <code>2*fftLen</code> interleaved values as shown below.
618                    <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
619                    The FFT result will be contained in the same array and the frequency domain
620                    values will have the same interleaving.
621 
622   @par Floating-point
623                    The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-8
624                    stages are performed along with a single radix-2 or radix-4 stage, as needed.
625                    The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
626                    a different twiddle factor table.
627   @par
628                    The function uses the standard FFT definition and output values may grow by a
629                    factor of <code>fftLen</code> when computing the forward transform.  The
630                    inverse transform includes a scale of <code>1/fftLen</code> as part of the
631                    calculation and this matches the textbook definition of the inverse FFT.
632   @par
633                    For the MVE version, the new arm_cfft_init_f32 initialization function is
634                    <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
635                    needed FFTs.</b> Other FFT versions can continue to be initialized as
636                    explained below.
637   @par
638                    For not MVE versions, pre-initialized data structures containing twiddle factors
639                    and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>.  Include
640                    this header in your function and then pass one of the constant structures as
641                    an argument to arm_cfft_f32.  For example:
642   @par
643                    <code>arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
644   @par
645                    computes a 64-point inverse complex FFT including bit reversal.
646                    The data structures are treated as constant data and not modified during the
647                    calculation.  The same data structure can be reused for multiple transforms
648                    including mixing forward and inverse transforms.
649   @par
650                    Earlier releases of the library provided separate radix-2 and radix-4
651                    algorithms that operated on floating-point data.  These functions are still
652                    provided but are deprecated.  The older functions are slower and less general
653                    than the new functions.
654   @par
655                    An example of initialization of the constants for the arm_cfft_f32 function follows:
656   @code
657                    const static arm_cfft_instance_f32 *S;
658                    ...
659                      switch (length) {
660                        case 16:
661                          S = &arm_cfft_sR_f32_len16;
662                          break;
663                        case 32:
664                          S = &arm_cfft_sR_f32_len32;
665                          break;
666                        case 64:
667                          S = &arm_cfft_sR_f32_len64;
668                          break;
669                        case 128:
670                          S = &arm_cfft_sR_f32_len128;
671                          break;
672                        case 256:
673                          S = &arm_cfft_sR_f32_len256;
674                          break;
675                        case 512:
676                          S = &arm_cfft_sR_f32_len512;
677                          break;
678                        case 1024:
679                          S = &arm_cfft_sR_f32_len1024;
680                          break;
681                        case 2048:
682                          S = &arm_cfft_sR_f32_len2048;
683                          break;
684                        case 4096:
685                          S = &arm_cfft_sR_f32_len4096;
686                          break;
687                      }
688   @endcode
689   @par
690                    The new arm_cfft_init_f32 can also be used.
691   @par Q15 and Q31
                   The fixed-point complex FFT uses a mixed-radix algorithm.  Multiple radix-4
693                    stages are performed along with a single radix-2 stage, as needed.
694                    The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
695                    a different twiddle factor table.
696   @par
697                    The function uses the standard FFT definition and output values may grow by a
698                    factor of <code>fftLen</code> when computing the forward transform.  The
699                    inverse transform includes a scale of <code>1/fftLen</code> as part of the
700                    calculation and this matches the textbook definition of the inverse FFT.
701   @par
702                    Pre-initialized data structures containing twiddle factors and bit reversal
703                    tables are provided and defined in <code>arm_const_structs.h</code>.  Include
704                    this header in your function and then pass one of the constant structures as
705                    an argument to arm_cfft_q31. For example:
706   @par
707                    <code>arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
708   @par
709                    computes a 64-point inverse complex FFT including bit reversal.
710                    The data structures are treated as constant data and not modified during the
711                    calculation.  The same data structure can be reused for multiple transforms
712                    including mixing forward and inverse transforms.
713   @par
714                    Earlier releases of the library provided separate radix-2 and radix-4
715                    algorithms that operated on floating-point data.  These functions are still
716                    provided but are deprecated.  The older functions are slower and less general
717                    than the new functions.
718   @par
719                    An example of initialization of the constants for the arm_cfft_q31 function follows:
720   @code
721                    const static arm_cfft_instance_q31 *S;
722                    ...
723                      switch (length) {
724                        case 16:
725                          S = &arm_cfft_sR_q31_len16;
726                          break;
727                        case 32:
728                          S = &arm_cfft_sR_q31_len32;
729                          break;
730                        case 64:
731                          S = &arm_cfft_sR_q31_len64;
732                          break;
733                        case 128:
734                          S = &arm_cfft_sR_q31_len128;
735                          break;
736                        case 256:
737                          S = &arm_cfft_sR_q31_len256;
738                          break;
739                        case 512:
740                          S = &arm_cfft_sR_q31_len512;
741                          break;
742                        case 1024:
743                          S = &arm_cfft_sR_q31_len1024;
744                          break;
745                        case 2048:
746                          S = &arm_cfft_sR_q31_len2048;
747                          break;
748                        case 4096:
749                          S = &arm_cfft_sR_q31_len4096;
750                          break;
751                      }
752   @endcode
753 
754  */
755 
/**
  @brief         Forward CFFT helper (scalar, non-MVE path): performs one
                 radix-2 decimation-in-frequency stage on the length-L
                 input, then completes the transform with two length-L/2
                 radix-8 FFTs.
  @param[in]     S    CFFT instance (twiddle table pTwiddle, fftLen)
  @param[in,out] p1   interleaved complex data, processed in-place
  @return        none
 */
void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
{
  uint32_t    L  = S->fftLen;
  float32_t * pCol1, * pCol2, * pMid1, * pMid2;
  float32_t * p2 = p1 + L;                       /* second half (L floats = L/2 complex) */
  const float32_t * tw = (float32_t *) S->pTwiddle;
  float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
  float32_t m0, m1, m2, m3;
  uint32_t l;

  pCol1 = p1;
  pCol2 = p2;

  /* Define new length */
  L >>= 1;

  /* Initialize mid pointers */
  pMid1 = p1 + L;
  pMid2 = p2 + L;

  /* do two dot Fourier transform */
  /* Each iteration processes 2 complex samples from the front quarter and,
     via pMid1/pMid2, 2 from the middle — exploiting the twiddle table's
     vertical symmetry so one {twR, twI} pair serves both. */
  for (l = L >> 2; l > 0; l-- )
  {
    t1[0] = p1[0];
    t1[1] = p1[1];
    t1[2] = p1[2];
    t1[3] = p1[3];

    t2[0] = p2[0];
    t2[1] = p2[1];
    t2[2] = p2[2];
    t2[3] = p2[3];

    t3[0] = pMid1[0];
    t3[1] = pMid1[1];
    t3[2] = pMid1[2];
    t3[3] = pMid1[3];

    t4[0] = pMid2[0];
    t4[1] = pMid2[1];
    t4[2] = pMid2[2];
    t4[3] = pMid2[3];

    *p1++ = t1[0] + t2[0];
    *p1++ = t1[1] + t2[1];
    *p1++ = t1[2] + t2[2];
    *p1++ = t1[3] + t2[3];    /* col 1 */

    t2[0] = t1[0] - t2[0];
    t2[1] = t1[1] - t2[1];
    t2[2] = t1[2] - t2[2];
    t2[3] = t1[3] - t2[3];    /* for col 2 */

    *pMid1++ = t3[0] + t4[0];
    *pMid1++ = t3[1] + t4[1];
    *pMid1++ = t3[2] + t4[2];
    *pMid1++ = t3[3] + t4[3]; /* col 1 */

    /* note the reversed sign (t4 - t3): folded into the symmetric twiddle
       multiply below */
    t4[0] = t4[0] - t3[0];
    t4[1] = t4[1] - t3[1];
    t4[2] = t4[2] - t3[2];
    t4[3] = t4[3] - t3[3];    /* for col 2 */

    twR = *tw++;
    twI = *tw++;

    /* multiply by twiddle factors */
    m0 = t2[0] * twR;
    m1 = t2[1] * twI;
    m2 = t2[1] * twR;
    m3 = t2[0] * twI;

    /* R  =  R  *  Tr - I * Ti */
    *p2++ = m0 + m1;
    /* I  =  I  *  Tr + R * Ti */
    *p2++ = m2 - m3;

    /* use vertical symmetry */
    /*  0.9988 - 0.0491i <==> -0.0491 - 0.9988i */
    m0 = t4[0] * twI;
    m1 = t4[1] * twR;
    m2 = t4[1] * twI;
    m3 = t4[0] * twR;

    *pMid2++ = m0 - m1;
    *pMid2++ = m2 + m3;

    /* second complex sample of this iteration, same pattern */
    twR = *tw++;
    twI = *tw++;

    m0 = t2[2] * twR;
    m1 = t2[3] * twI;
    m2 = t2[3] * twR;
    m3 = t2[2] * twI;

    *p2++ = m0 + m1;
    *p2++ = m2 - m3;

    m0 = t4[2] * twI;
    m1 = t4[3] * twR;
    m2 = t4[3] * twI;
    m3 = t4[2] * twR;

    *pMid2++ = m0 - m1;
    *pMid2++ = m2 + m3;
  }

  /* first col */
  /* twidCoefModifier = 2: the radix-8 stages step through the same twiddle
     table at twice the stride for the half-length transforms */
  arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 2U);

  /* second col */
  arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 2U);
}
869 
/*
  Length-N CFFT computed as one radix-4 decimation-in-frequency stage
  followed by four independent length-N/4 radix-8 butterfly passes.

  The buffer is viewed as four quarters p1..p4 (each L = N/2 float32
  values, i.e. N/4 interleaved complex samples).  The first stage forms
  the four radix-4 column outputs in place and multiplies columns 2-4
  by their stage twiddle factors.  Butterflies are processed in pairs
  from both ends of each quarter (p* walking forward over real/imag
  pairs, pEnd* walking backward) so that the vertical symmetry of the
  twiddle table lets one table read serve both a top and a bottom
  butterfly.  The first butterfly (all twiddles equal 1) and the middle
  one are peeled out of the loop.

  S  - initialized CFFT instance (twiddle table, length)
  p1 - in/out interleaved complex buffer of 2*fftLen float32 values
*/
void arm_cfft_radix8by4_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
{
    uint32_t    L  = S->fftLen >> 1;   /* floats per quarter: N/4 complex = N/2 floats */
    float32_t * pCol1, *pCol2, *pCol3, *pCol4, *pEnd1, *pEnd2, *pEnd3, *pEnd4;
    const float32_t *tw2, *tw3, *tw4;
    float32_t * p2 = p1 + L;           /* start of quarter 2 */
    float32_t * p3 = p2 + L;           /* start of quarter 3 */
    float32_t * p4 = p3 + L;           /* start of quarter 4 */
    float32_t t2[4], t3[4], t4[4], twR, twI;
    float32_t p1ap3_0, p1sp3_0, p1ap3_1, p1sp3_1;
    float32_t m0, m1, m2, m3;
    uint32_t l, twMod2, twMod3, twMod4;

    pCol1 = p1;         /* points to real values by default */
    pCol2 = p2;
    pCol3 = p3;
    pCol4 = p4;
    pEnd1 = p2 - 1;     /* points to imaginary values by default */
    pEnd2 = p3 - 1;
    pEnd3 = p4 - 1;
    pEnd4 = pEnd3 + L;

    tw2 = tw3 = tw4 = (float32_t *) S->pTwiddle;

    L >>= 1;            /* L is now the number of complex samples per quarter */

    /* do four dot Fourier transform */

    /* Twiddle strides in floats: column k uses W^((k-1)*l), so tw2
       advances by one complex value per butterfly, tw3 by two, tw4 by
       three. */
    twMod2 = 2;
    twMod3 = 4;
    twMod4 = 6;

    /* First butterfly peeled out: all twiddle factors are 1. */
    /* TOP */
    p1ap3_0 = p1[0] + p3[0];
    p1sp3_0 = p1[0] - p3[0];
    p1ap3_1 = p1[1] + p3[1];
    p1sp3_1 = p1[1] - p3[1];

    /* col 2 */
    t2[0] = p1sp3_0 + p2[1] - p4[1];
    t2[1] = p1sp3_1 - p2[0] + p4[0];
    /* col 3 */
    t3[0] = p1ap3_0 - p2[0] - p4[0];
    t3[1] = p1ap3_1 - p2[1] - p4[1];
    /* col 4 */
    t4[0] = p1sp3_0 - p2[1] + p4[1];
    t4[1] = p1sp3_1 + p2[0] - p4[0];
    /* col 1 (written first so the p1 reads above are not clobbered) */
    *p1++ = p1ap3_0 + p2[0] + p4[0];
    *p1++ = p1ap3_1 + p2[1] + p4[1];

    /* Twiddle factors are ones */
    *p2++ = t2[0];
    *p2++ = t2[1];
    *p3++ = t3[0];
    *p3++ = t3[1];
    *p4++ = t4[0];
    *p4++ = t4[1];

    tw2 += twMod2;
    tw3 += twMod3;
    tw4 += twMod4;

    /* (L - 2)/2 iterations: each pass computes one butterfly from the
       top of every quarter and one from the bottom, sharing twiddle
       table reads via symmetry. */
    for (l = (L - 2) >> 1; l > 0; l-- )
    {
      /* TOP */
      p1ap3_0 = p1[0] + p3[0];
      p1sp3_0 = p1[0] - p3[0];
      p1ap3_1 = p1[1] + p3[1];
      p1sp3_1 = p1[1] - p3[1];
      /* col 2 */
      t2[0] = p1sp3_0 + p2[1] - p4[1];
      t2[1] = p1sp3_1 - p2[0] + p4[0];
      /* col 3 */
      t3[0] = p1ap3_0 - p2[0] - p4[0];
      t3[1] = p1ap3_1 - p2[1] - p4[1];
      /* col 4 */
      t4[0] = p1sp3_0 - p2[1] + p4[1];
      t4[1] = p1sp3_1 + p2[0] - p4[0];
      /* col 1 - top */
      *p1++ = p1ap3_0 + p2[0] + p4[0];
      *p1++ = p1ap3_1 + p2[1] + p4[1];

      /* BOTTOM: pEnd*[0] is the imaginary part, pEnd*[-1] the real
         part of the sample being walked backwards. */
      p1ap3_1 = pEnd1[-1] + pEnd3[-1];
      p1sp3_1 = pEnd1[-1] - pEnd3[-1];
      p1ap3_0 = pEnd1[ 0] + pEnd3[0];
      p1sp3_0 = pEnd1[ 0] - pEnd3[0];
      /* col 2 */
      t2[2] = pEnd2[0] - pEnd4[0] + p1sp3_1;
      t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];
      /* col 3 */
      t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
      t3[3] = p1ap3_0 - pEnd2[ 0] - pEnd4[ 0];
      /* col 4 */
      t4[2] = pEnd2[ 0] - pEnd4[ 0] - p1sp3_1;
      t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
      /* col 1 - Bottom */
      *pEnd1-- = p1ap3_0 + pEnd2[ 0] + pEnd4[ 0];
      *pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];

      /* COL 2 */
      /* read twiddle factors */
      twR = *tw2++;
      twI = *tw2++;
      /* multiply by twiddle factors */
      /*  let    Z1 = a + i(b),   Z2 = c + i(d) */
      /*   =>  Z1 * Z2  =  (a*c - b*d) + i(b*c + a*d) */

      /* Top */
      m0 = t2[0] * twR;
      m1 = t2[1] * twI;
      m2 = t2[1] * twR;
      m3 = t2[0] * twI;

      *p2++ = m0 + m1;
      *p2++ = m2 - m3;
      /* use vertical symmetry col 2 */
      /* 0.9997 - 0.0245i  <==>  0.0245 - 0.9997i */
      /* Bottom */
      m0 = t2[3] * twI;
      m1 = t2[2] * twR;
      m2 = t2[2] * twI;
      m3 = t2[3] * twR;

      *pEnd2-- = m0 - m1;
      *pEnd2-- = m2 + m3;

      /* COL 3 */
      twR = tw3[0];
      twI = tw3[1];
      tw3 += twMod3;
      /* Top */
      m0 = t3[0] * twR;
      m1 = t3[1] * twI;
      m2 = t3[1] * twR;
      m3 = t3[0] * twI;

      *p3++ = m0 + m1;
      *p3++ = m2 - m3;
      /* use vertical symmetry col 3 */
      /* 0.9988 - 0.0491i  <==>  -0.9988 - 0.0491i */
      /* Bottom */
      m0 = -t3[3] * twR;
      m1 =  t3[2] * twI;
      m2 =  t3[2] * twR;
      m3 =  t3[3] * twI;

      *pEnd3-- = m0 - m1;
      *pEnd3-- = m3 - m2;

      /* COL 4 */
      twR = tw4[0];
      twI = tw4[1];
      tw4 += twMod4;
      /* Top */
      m0 = t4[0] * twR;
      m1 = t4[1] * twI;
      m2 = t4[1] * twR;
      m3 = t4[0] * twI;

      *p4++ = m0 + m1;
      *p4++ = m2 - m3;
      /* use vertical symmetry col 4 */
      /* 0.9973 - 0.0736i  <==>  -0.0736 + 0.9973i */
      /* Bottom */
      m0 = t4[3] * twI;
      m1 = t4[2] * twR;
      m2 = t4[2] * twI;
      m3 = t4[3] * twR;

      *pEnd4-- = m0 - m1;
      *pEnd4-- = m2 + m3;
    }

    /* MIDDLE butterfly peeled out of the loop (it is its own mirror). */
    /* Twiddle factors are */
    /*  1.0000  0.7071-0.7071i  -1.0000i  -0.7071-0.7071i */
    p1ap3_0 = p1[0] + p3[0];
    p1sp3_0 = p1[0] - p3[0];
    p1ap3_1 = p1[1] + p3[1];
    p1sp3_1 = p1[1] - p3[1];

    /* col 2 */
    t2[0] = p1sp3_0 + p2[1] - p4[1];
    t2[1] = p1sp3_1 - p2[0] + p4[0];
    /* col 3 */
    t3[0] = p1ap3_0 - p2[0] - p4[0];
    t3[1] = p1ap3_1 - p2[1] - p4[1];
    /* col 4 */
    t4[0] = p1sp3_0 - p2[1] + p4[1];
    t4[1] = p1sp3_1 + p2[0] - p4[0];
    /* col 1 - Top */
    *p1++ = p1ap3_0 + p2[0] + p4[0];
    *p1++ = p1ap3_1 + p2[1] + p4[1];

    /* COL 2 */
    twR = tw2[0];
    twI = tw2[1];

    m0 = t2[0] * twR;
    m1 = t2[1] * twI;
    m2 = t2[1] * twR;
    m3 = t2[0] * twI;

    *p2++ = m0 + m1;
    *p2++ = m2 - m3;
    /* COL 3 */
    twR = tw3[0];
    twI = tw3[1];

    m0 = t3[0] * twR;
    m1 = t3[1] * twI;
    m2 = t3[1] * twR;
    m3 = t3[0] * twI;

    *p3++ = m0 + m1;
    *p3++ = m2 - m3;
    /* COL 4 */
    twR = tw4[0];
    twI = tw4[1];

    m0 = t4[0] * twR;
    m1 = t4[1] * twI;
    m2 = t4[1] * twR;
    m3 = t4[0] * twI;

    *p4++ = m0 + m1;
    *p4++ = m2 - m3;

    /* Finish each length-L column with the radix-8 kernel; twiddle
       table stride 4 selects every 4th entry of the full-length table. */
    /* first col */
    arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 4U);

    /* second col */
    arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 4U);

    /* third col */
    arm_radix8_butterfly_f32 (pCol3, L, (float32_t *) S->pTwiddle, 4U);

    /* fourth col */
    arm_radix8_butterfly_f32 (pCol4, L, (float32_t *) S->pTwiddle, 4U);
}
1112 
1113 /**
1114   @addtogroup ComplexFFT
1115   @{
1116  */
1117 
1118 /**
1119   @brief         Processing function for the floating-point complex FFT.
1120   @param[in]     S              points to an instance of the floating-point CFFT structure
1121   @param[in,out] p1             points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
1122   @param[in]     ifftFlag       flag that selects transform direction
1123                    - value = 0: forward transform
1124                    - value = 1: inverse transform
1125   @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
1126                    - value = 0: disables bit reversal of output
1127                    - value = 1: enables bit reversal of output
1128   @return        none
1129  */
1130 
void arm_cfft_f32(
  const arm_cfft_instance_f32 * S,
        float32_t * p1,
        uint8_t ifftFlag,
        uint8_t bitReverseFlag)
{
  uint32_t  L = S->fftLen, l;
  float32_t invL, * pSrc;

  if (ifftFlag == 1U)
  {
    /* Inverse transform computed as conj(FFT(conj(x)))/N:
       conjugate the input here, run the forward kernels below, then
       conjugate and scale the result at the end. */
    pSrc = p1 + 1;           /* imaginary parts live at the odd indices */
    for (l = 0; l < L; l++)
    {
      *pSrc = -*pSrc;
      pSrc += 2;
    }
  }

  /* Dispatch on the transform length: 8^k lengths use the plain
     radix-8 kernel; 2*8^k and 4*8^k lengths need an initial radix-2
     resp. radix-4 split first.  The helper kernels do not modify the
     instance, so the const cast is safe. */
  switch (L)
  {
  case 16:
  case 128:
  case 1024:
    arm_cfft_radix8by2_f32 ( (arm_cfft_instance_f32 *) S, p1);
    break;
  case 32:
  case 256:
  case 2048:
    arm_cfft_radix8by4_f32 ( (arm_cfft_instance_f32 *) S, p1);
    break;
  case 64:
  case 512:
  case 4096:
    arm_radix8_butterfly_f32 ( p1, L, (float32_t *) S->pTwiddle, 1);
    break;
  default:
    /* Unsupported length: unreachable for a correctly initialized
       instance; deliberately leave the buffer untouched. */
    break;
  }

  if ( bitReverseFlag )
    arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);

  if (ifftFlag == 1U)
  {
    /* L is a power of two, so 1/L is exactly representable in float. */
    invL = 1.0f / (float32_t)L;

    /* Conjugate and scale output data */
    pSrc = p1;
    for (l= 0; l < L; l++)
    {
      *pSrc++ *=   invL ;
      *pSrc    = -(*pSrc) * invL;
      pSrc++;
    }
  }
}
1187 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
1188 
1189 /**
1190   @} end of ComplexFFT group
1191  */
1192