1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cfft_f32.c
4  * Description:  Combined Radix Decimation in Frequency CFFT Floating point processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/transform_functions.h"
30 #include "arm_common_tables.h"
31 
32 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
33 
34 #include "arm_helium_utils.h"
35 #include "arm_vec_fft.h"
36 #include "arm_mve_tables.h"
37 
38 
arm_inverse_fft_length_f32(uint16_t fftLen)39 static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
40 {
41   float32_t retValue=1.0;
42 
43   switch (fftLen)
44   {
45 
46   case 4096U:
47     retValue = 0.000244140625;
48     break;
49 
50   case 2048U:
51     retValue = 0.00048828125;
52     break;
53 
54   case 1024U:
55     retValue = 0.0009765625f;
56     break;
57 
58   case 512U:
59     retValue = 0.001953125;
60     break;
61 
62   case 256U:
63     retValue = 0.00390625f;
64     break;
65 
66   case 128U:
67     retValue = 0.0078125;
68     break;
69 
70   case 64U:
71     retValue = 0.015625f;
72     break;
73 
74   case 32U:
75     retValue = 0.03125;
76     break;
77 
78   case 16U:
79     retValue = 0.0625f;
80     break;
81 
82 
83   default:
84     break;
85   }
86   return(retValue);
87 }
88 
89 
90 
91 
/**
  @brief         Core radix-4 decimation-in-frequency butterfly stages for the
                 forward floating-point CFFT (Helium/MVE vectorized).
  @param[in]     S       points to the CFFT instance holding the rearranged twiddle tables
  @param[in,out] pSrc    complex in-place buffer, interleaved {re, im} pairs
  @param[in]     fftLen  transform length handled by this call (power of 4)

  All stages except the last apply twiddle factors loaded from the
  pre-rearranged per-stage tables; the last (twiddle-free) stage is done with
  gather loads / scatter stores so that its outputs land in the order the
  bit-reversal step expects.
 */
static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen)
{
    f32x4_t     vecTmp0, vecTmp1;
    f32x4_t     vecSum0, vecDiff0, vecSum1, vecDiff1;
    f32x4_t     vecA, vecB, vecC, vecD;
    uint32_t    blkCnt;
    uint32_t    n1, n2;
    uint32_t    stage = 0;
    int32_t     iter = 1;
    /* Byte offsets feeding the gather/scatter last stage.
       NOTE(review): element size is written as sizeof(q31_t *); this equals
       sizeof(float32_t) (4) on the 32-bit targets this MVE path supports. */
    static const int32_t strides[4] = {
        (0 - 16) * (int32_t)sizeof(q31_t *),
        (1 - 16) * (int32_t)sizeof(q31_t *),
        (8 - 16) * (int32_t)sizeof(q31_t *),
        (9 - 16) * (int32_t)sizeof(q31_t *)
    };

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* all stages but the last: k = butterflies per group still to process */
    for (int k = fftLen / 4u; k > 1; k >>= 2)
    {
        /* per-stage twiddle sub-tables, pre-rearranged for contiguous vector loads */
        float32_t const     *p_rearranged_twiddle_tab_stride1 =
                            &S->rearranged_twiddle_stride1[
                            S->rearranged_twiddle_tab_stride1_arr[stage]];
        float32_t const     *p_rearranged_twiddle_tab_stride2 =
                            &S->rearranged_twiddle_stride2[
                            S->rearranged_twiddle_tab_stride2_arr[stage]];
        float32_t const     *p_rearranged_twiddle_tab_stride3 =
                            &S->rearranged_twiddle_stride3[
                            S->rearranged_twiddle_tab_stride3_arr[stage]];

        float32_t * pBase = pSrc;
        for (int i = 0; i < iter; i++)
        {
            /* the four butterfly legs, spaced n2 complex samples apart */
            float32_t    *inA = pBase;
            float32_t    *inB = inA + n2 * CMPLX_DIM;
            float32_t    *inC = inB + n2 * CMPLX_DIM;
            float32_t    *inD = inC + n2 * CMPLX_DIM;
            float32_t const *pW1 = p_rearranged_twiddle_tab_stride1;
            float32_t const *pW2 = p_rearranged_twiddle_tab_stride2;
            float32_t const *pW3 = p_rearranged_twiddle_tab_stride3;
            f32x4_t            vecW;

            blkCnt = n2 / 2;
            /*
             * load 2 f32 complex pair
             */
            vecA = vldrwq_f32(inA);
            vecC = vldrwq_f32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_f32(inB);
                vecD = vldrwq_f32(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 4;

                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 4;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
                 */
                vecW = vld1q(pW1);
                pW1 +=4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 4;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 4;

                /* pre-load for next iteration */
                vecA = vldrwq_f32(inA);
                vecC = vldrwq_f32(inC);

                blkCnt--;
            }
            pBase +=  CMPLX_DIM * n1;
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * start of Last stage process
     */
    uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /* load scheduling */
    vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

    /* 8 complex samples (2 butterflies) handled per iteration */
    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration */
        vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

        vecTmp0 = vecSum0 + vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
255 
/**
  @brief         Forward CFFT for lengths of the form 2 * 4^n (Helium/MVE):
                 one radix-2 decimation-in-frequency stage followed by two
                 independent half-length radix-4 transforms.
  @param[in]     S       points to the CFFT instance (twiddle tables)
  @param[in,out] pSrc    complex in-place buffer, interleaved {re, im} pairs
  @param[in]     fftLen  transform length (2 * power of 4)

  Fix vs. original: removed the dead trailing assignment `pIn0 = pSrc;`
  (local variable, never read afterwards); also matches the inverse variant,
  which has no such statement.
 */
static void arm_cfft_radix4by2_f32_mve(const arm_cfft_instance_f32 * S, float32_t *pSrc, uint32_t fftLen)
{
    float32_t const *pCoefVec;
    float32_t const  *pCoef = S->pTwiddle;
    float32_t        *pIn0, *pIn1;
    uint32_t          n2;
    uint32_t          blkCnt;
    f32x4_t         vecIn0, vecIn1, vecSum, vecDiff;
    f32x4_t         vecCmplxTmp, vecTw;


    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;   /* second half: n2 complex samples = fftLen floats */
    pCoefVec = pCoef;

    /* radix-2 stage: 2 complex samples (4 floats) per vector iteration */
    blkCnt = n2 / 2;
    while (blkCnt > 0U)
    {
        vecIn0 = *(f32x4_t *) pIn0;
        vecIn1 = *(f32x4_t *) pIn1;
        vecTw = vld1q(pCoefVec);
        pCoefVec += 4;

        vecSum = vecIn0 + vecIn1;
        vecDiff = vecIn0 - vecIn1;

        /* lower leg multiplied by conjugate twiddle (forward transform) */
        vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);

        vst1q(pIn0, vecSum);
        pIn0 += 4;
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        blkCnt--;
    }

    /* two independent radix-4 sub-transforms of length fftLen/2 */
    _arm_radix4_butterfly_f32_mve(S, pSrc, n2);

    _arm_radix4_butterfly_f32_mve(S, pSrc + fftLen, n2);
}
299 
/**
  @brief         Core radix-4 butterfly stages for the inverse floating-point
                 CFFT (Helium/MVE vectorized).
  @param[in]     S            points to the CFFT instance holding the rearranged twiddle tables
  @param[in,out] pSrc         complex in-place buffer, interleaved {re, im} pairs
  @param[in]     fftLen       transform length handled by this call (power of 4)
  @param[in]     onebyfftLen  scaling factor 1/fftLen applied in the last stage

  Mirrors the forward kernel but uses the non-conjugate complex multiply
  (MVE_CMPLX_MULT_FLT_AxB) and folds the 1/fftLen scaling into the final
  gather/scatter stage.
 */
static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen, float32_t onebyfftLen)
{
    f32x4_t vecTmp0, vecTmp1;
    f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f32x4_t vecA, vecB, vecC, vecD;
    uint32_t  blkCnt;
    uint32_t  n1, n2;
    uint32_t  stage = 0;
    int32_t  iter = 1;
    /* Byte offsets feeding the gather/scatter last stage.
       NOTE(review): element size is written as sizeof(q31_t *); this equals
       sizeof(float32_t) (4) on the 32-bit targets this MVE path supports. */
    static const int32_t strides[4] = {
        (0 - 16) * (int32_t)sizeof(q31_t *),
        (1 - 16) * (int32_t)sizeof(q31_t *),
        (8 - 16) * (int32_t)sizeof(q31_t *),
        (9 - 16) * (int32_t)sizeof(q31_t *)
    };

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* all stages but the last: k = butterflies per group still to process */
    for (int k = fftLen / 4; k > 1; k >>= 2)
    {
        /* per-stage twiddle sub-tables, pre-rearranged for contiguous vector loads */
        float32_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                S->rearranged_twiddle_tab_stride1_arr[stage]];
        float32_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                S->rearranged_twiddle_tab_stride2_arr[stage]];
        float32_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                S->rearranged_twiddle_tab_stride3_arr[stage]];

        float32_t * pBase = pSrc;
        for (int i = 0; i < iter; i++)
        {
            /* the four butterfly legs, spaced n2 complex samples apart */
            float32_t    *inA = pBase;
            float32_t    *inB = inA + n2 * CMPLX_DIM;
            float32_t    *inC = inB + n2 * CMPLX_DIM;
            float32_t    *inD = inC + n2 * CMPLX_DIM;
            float32_t const *pW1 = p_rearranged_twiddle_tab_stride1;
            float32_t const *pW2 = p_rearranged_twiddle_tab_stride2;
            float32_t const *pW3 = p_rearranged_twiddle_tab_stride3;
            f32x4_t       vecW;

            blkCnt = n2 / 2;
            /*
             * load 2 f32 complex pair
             */
            vecA = vldrwq_f32(inA);
            vecC = vldrwq_f32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_f32(inB);
                vecD = vldrwq_f32(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 4;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 4;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 4;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 4;

                /* pre-load for next iteration */
                vecA = vldrwq_f32(inA);
                vecC = vldrwq_f32(inC);

                blkCnt--;
            }
            pBase +=  CMPLX_DIM * n1;
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * start of Last stage process
     */
    uint32x4_t vecScGathAddr = vld1q_u32 ((uint32_t*)strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

    /* 8 complex samples (2 butterflies) handled per iteration,
       with the 1/fftLen scaling applied before each store */
    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration */
        vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

        vecTmp0 = vecSum0 + vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
467 
/**
  @brief         Inverse CFFT for lengths of the form 2 * 4^n (Helium/MVE):
                 one radix-2 stage followed by two independent half-length
                 inverse radix-4 transforms, each scaled by 1/(fftLen/2).
  @param[in]     S       points to the CFFT instance (twiddle tables)
  @param[in,out] pSrc    complex in-place buffer, interleaved {re, im} pairs
  @param[in]     fftLen  transform length (2 * power of 4)
 */
static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t *pSrc, uint32_t fftLen)
{
    uint32_t         halfLen = fftLen >> 1;
    float32_t        scale   = arm_inverse_fft_length_f32(fftLen);
    float32_t       *pTop    = pSrc;             /* first half of the buffer  */
    float32_t       *pBot    = pSrc + fftLen;    /* second half (halfLen cplx) */
    float32_t const *pTw     = S->pTwiddle;
    f32x4_t          vTop, vBot, vAdd, vSub, vTwiddle, vProd;

    /* radix-2 stage: 2 complex samples (4 floats) per vector iteration */
    for (uint32_t cnt = halfLen / 2; cnt > 0U; cnt--)
    {
        vTop = *(f32x4_t *) pTop;
        vBot = *(f32x4_t *) pBot;
        vTwiddle = vld1q(pTw);
        pTw += 4;

        vAdd = vTop + vBot;
        vSub = vTop - vBot;

        /* lower leg multiplied by (non-conjugate) twiddle: inverse transform */
        vProd = MVE_CMPLX_MULT_FLT_AxB(vTwiddle, vSub);

        vst1q(pTop, vAdd);
        pTop += 4;
        vst1q(pBot, vProd);
        pBot += 4;
    }

    /* two independent inverse radix-4 sub-transforms of length fftLen/2 */
    _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, halfLen, scale);
    _arm_radix4_butterfly_inverse_f32_mve(S, pSrc + fftLen, halfLen, scale);
}
510 
511 
512 /**
513   @addtogroup ComplexFFTF32
514   @{
515  */
516 
517 /**
518   @brief         Processing function for the floating-point complex FFT.
519   @param[in]     S              points to an instance of the floating-point CFFT structure
520   @param[in,out] p1             points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
521   @param[in]     ifftFlag       flag that selects transform direction
522                    - value = 0: forward transform
523                    - value = 1: inverse transform
524   @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
525                    - value = 0: disables bit reversal of output
526                    - value = 1: enables bit reversal of output
527  */
528 
529 
arm_cfft_f32(const arm_cfft_instance_f32 * S,float32_t * pSrc,uint8_t ifftFlag,uint8_t bitReverseFlag)530 ARM_DSP_ATTRIBUTE void arm_cfft_f32(
531   const arm_cfft_instance_f32 * S,
532         float32_t * pSrc,
533         uint8_t ifftFlag,
534         uint8_t bitReverseFlag)
535 {
536         uint32_t fftLen = S->fftLen;
537 
538         if (ifftFlag == 1U) {
539 
540             switch (fftLen) {
541             case 16:
542             case 64:
543             case 256:
544             case 1024:
545             case 4096:
546                 _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen));
547                 break;
548 
549             case 32:
550             case 128:
551             case 512:
552             case 2048:
553                 arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
554                 break;
555             }
556         } else {
557             switch (fftLen) {
558             case 16:
559             case 64:
560             case 256:
561             case 1024:
562             case 4096:
563                 _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
564                 break;
565 
566             case 32:
567             case 128:
568             case 512:
569             case 2048:
570                 arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
571                 break;
572             }
573         }
574 
575 
576         if (bitReverseFlag)
577         {
578 
579             arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
580 
581         }
582 }
583 
584 
585 #else
586 extern void arm_radix8_butterfly_f32(
587         float32_t * pSrc,
588         uint16_t fftLen,
589   const float32_t * pCoef,
590         uint16_t twidCoefModifier);
591 
592 extern void arm_bitreversal_32(
593         uint32_t * pSrc,
594   const uint16_t bitRevLen,
595   const uint16_t * pBitRevTable);
596 
597 /**
598   @ingroup groupTransforms
599  */
600 
601 /**
602   @defgroup ComplexFFT Complex FFT Functions
603 
604   @par
605                    The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
606                    Discrete Fourier Transform (DFT).  The FFT can be orders of magnitude faster
607                    than the DFT, especially for long lengths.
608                    The algorithms described in this section
609                    operate on complex data.  A separate set of functions is devoted to handling
610                    of real sequences.
611   @par
612                    There are separate algorithms for handling floating-point, Q15, and Q31 data
613                    types.  The algorithms available for each data type are described next.
614   @par
615                    The FFT functions operate in-place.  That is, the array holding the input data
616                    will also be used to hold the corresponding result.  The input data is complex
617                    and contains <code>2*fftLen</code> interleaved values as shown below.
618                    <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
619                    The FFT result will be contained in the same array and the frequency domain
620                    values will have the same interleaving.
621 
622   @par Floating-point
623                    The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-8
624                    stages are performed along with a single radix-2 or radix-4 stage, as needed.
625                    The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
626                    a different twiddle factor table.
627   @par
628                    The function uses the standard FFT definition and output values may grow by a
629                    factor of <code>fftLen</code> when computing the forward transform.  The
630                    inverse transform includes a scale of <code>1/fftLen</code> as part of the
631                    calculation and this matches the textbook definition of the inverse FFT.
632   @par
633                    For the MVE version, the new arm_cfft_init_f32 initialization function is
634                    <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
635                    needed FFTs.</b> Other FFT versions can continue to be initialized as
636                    explained below.
637   @par
                   For non-MVE versions, pre-initialized data structures containing twiddle factors
639                    and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>.  Include
640                    this header in your function and then pass one of the constant structures as
641                    an argument to arm_cfft_f32.  For example:
642   @par
                   <code>arm_cfft_f32(&arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
644   @par
645                    computes a 64-point inverse complex FFT including bit reversal.
646                    The data structures are treated as constant data and not modified during the
647                    calculation.  The same data structure can be reused for multiple transforms
648                    including mixing forward and inverse transforms.
649   @par
650                    Earlier releases of the library provided separate radix-2 and radix-4
651                    algorithms that operated on floating-point data.  These functions are still
652                    provided but are deprecated.  The older functions are slower and less general
653                    than the new functions.
654   @par
655                    An example of initialization of the constants for the arm_cfft_f32 function follows:
656   @code
657                    const static arm_cfft_instance_f32 *S;
658                    ...
659                    switch (length) {
660                      case 16:
661                        S = &arm_cfft_sR_f32_len16;
662                        break;
663                      case 32:
664                        S = &arm_cfft_sR_f32_len32;
665                        break;
666                      case 64:
667                        S = &arm_cfft_sR_f32_len64;
668                        break;
669                      case 128:
670                        S = &arm_cfft_sR_f32_len128;
671                        break;
672                      case 256:
673                        S = &arm_cfft_sR_f32_len256;
674                        break;
675                      case 512:
676                        S = &arm_cfft_sR_f32_len512;
677                        break;
678                      case 1024:
679                        S = &arm_cfft_sR_f32_len1024;
680                        break;
681                      case 2048:
682                        S = &arm_cfft_sR_f32_len2048;
683                        break;
684                      case 4096:
685                        S = &arm_cfft_sR_f32_len4096;
686                        break;
687                    }
688   @endcode
689 
690   @par
691                    The new arm_cfft_init_f32 can also be used.
692   @par Q15 and Q31
                   The fixed-point complex FFT uses a mixed-radix algorithm.  Multiple radix-4
694                    stages are performed along with a single radix-2 stage, as needed.
695                    The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
696                    a different twiddle factor table.
697   @par
698                    The function uses the standard FFT definition and output values may grow by a
699                    factor of <code>fftLen</code> when computing the forward transform.  The
700                    inverse transform includes a scale of <code>1/fftLen</code> as part of the
701                    calculation and this matches the textbook definition of the inverse FFT.
702   @par
703                    Pre-initialized data structures containing twiddle factors and bit reversal
704                    tables are provided and defined in <code>arm_const_structs.h</code>.  Include
705                    this header in your function and then pass one of the constant structures as
706                    an argument to arm_cfft_q31. For example:
707   @par
                   <code>arm_cfft_q31(&arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
709   @par
710                    computes a 64-point inverse complex FFT including bit reversal.
711                    The data structures are treated as constant data and not modified during the
712                    calculation.  The same data structure can be reused for multiple transforms
713                    including mixing forward and inverse transforms.
714   @par
715                    Earlier releases of the library provided separate radix-2 and radix-4
                   algorithms that operated on fixed-point data.  These functions are still
717                    provided but are deprecated.  The older functions are slower and less general
718                    than the new functions.
719   @par
720                    An example of initialization of the constants for the arm_cfft_q31 function follows:
721   @code
722                    const static arm_cfft_instance_q31 *S;
723                    ...
724                      switch (length) {
725                        case 16:
726                          S = &arm_cfft_sR_q31_len16;
727                          break;
728                        case 32:
729                          S = &arm_cfft_sR_q31_len32;
730                          break;
731                        case 64:
732                          S = &arm_cfft_sR_q31_len64;
733                          break;
734                        case 128:
735                          S = &arm_cfft_sR_q31_len128;
736                          break;
737                        case 256:
738                          S = &arm_cfft_sR_q31_len256;
739                          break;
740                        case 512:
741                          S = &arm_cfft_sR_q31_len512;
742                          break;
743                        case 1024:
744                          S = &arm_cfft_sR_q31_len1024;
745                          break;
746                        case 2048:
747                          S = &arm_cfft_sR_q31_len2048;
748                          break;
749                        case 4096:
750                          S = &arm_cfft_sR_q31_len4096;
751                          break;
752                      }
753   @endcode
754 
755  */
756 
/**
  @brief         CFFT helper for lengths of the form 2 * L: one radix-2
                 decimation-in-frequency stage, then two independent
                 half-length radix-8 transforms (scalar, non-MVE path).
  @param[in]     S   points to the CFFT instance (length and twiddle table)
  @param[in,out] p1  complex in-place buffer, interleaved {re, im} pairs

  The radix-2 stage processes two complex samples per iteration from both the
  start (p1/p2) and the middle (pMid1/pMid2) of each half, exploiting the
  vertical symmetry of the twiddle table so a single table walk serves both.
 */
static void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
{
  uint32_t    L  = S->fftLen;
  float32_t * pCol1, * pCol2, * pMid1, * pMid2;
  float32_t * p2 = p1 + L;
  const float32_t * tw = (float32_t *) S->pTwiddle;
  float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
  float32_t m0, m1, m2, m3;
  uint32_t l;

  pCol1 = p1;
  pCol2 = p2;

  /* Define new length */
  L >>= 1;

  /* Initialize mid pointers */
  pMid1 = p1 + L;
  pMid2 = p2 + L;

  /* do two dot Fourier transform */
  for (l = L >> 2; l > 0; l-- )
  {
    /* two complex samples from the top of each half */
    t1[0] = p1[0];
    t1[1] = p1[1];
    t1[2] = p1[2];
    t1[3] = p1[3];

    t2[0] = p2[0];
    t2[1] = p2[1];
    t2[2] = p2[2];
    t2[3] = p2[3];

    /* two complex samples from the middle of each half */
    t3[0] = pMid1[0];
    t3[1] = pMid1[1];
    t3[2] = pMid1[2];
    t3[3] = pMid1[3];

    t4[0] = pMid2[0];
    t4[1] = pMid2[1];
    t4[2] = pMid2[2];
    t4[3] = pMid2[3];

    *p1++ = t1[0] + t2[0];
    *p1++ = t1[1] + t2[1];
    *p1++ = t1[2] + t2[2];
    *p1++ = t1[3] + t2[3];    /* col 1 */

    t2[0] = t1[0] - t2[0];
    t2[1] = t1[1] - t2[1];
    t2[2] = t1[2] - t2[2];
    t2[3] = t1[3] - t2[3];    /* for col 2 */

    *pMid1++ = t3[0] + t4[0];
    *pMid1++ = t3[1] + t4[1];
    *pMid1++ = t3[2] + t4[2];
    *pMid1++ = t3[3] + t4[3]; /* col 1 */

    /* note the reversed sign vs t2: compensated below by swapping twR/twI */
    t4[0] = t4[0] - t3[0];
    t4[1] = t4[1] - t3[1];
    t4[2] = t4[2] - t3[2];
    t4[3] = t4[3] - t3[3];    /* for col 2 */

    twR = *tw++;
    twI = *tw++;

    /* multiply by conjugate twiddle factors (decimation in frequency) */
    m0 = t2[0] * twR;
    m1 = t2[1] * twI;
    m2 = t2[1] * twR;
    m3 = t2[0] * twI;

    /* R' =  R * Tr + I * Ti */
    *p2++ = m0 + m1;
    /* I' =  I * Tr - R * Ti */
    *p2++ = m2 - m3;

    /* use vertical symmetry */
    /*  0.9988 - 0.0491i <==> -0.0491 - 0.9988i */
    m0 = t4[0] * twI;
    m1 = t4[1] * twR;
    m2 = t4[1] * twI;
    m3 = t4[0] * twR;

    *pMid2++ = m0 - m1;
    *pMid2++ = m2 + m3;

    twR = *tw++;
    twI = *tw++;

    /* second complex sample of the pair, same conjugate multiply */
    m0 = t2[2] * twR;
    m1 = t2[3] * twI;
    m2 = t2[3] * twR;
    m3 = t2[2] * twI;

    *p2++ = m0 + m1;
    *p2++ = m2 - m3;

    m0 = t4[2] * twI;
    m1 = t4[3] * twR;
    m2 = t4[3] * twI;
    m3 = t4[2] * twR;

    *pMid2++ = m0 - m1;
    *pMid2++ = m2 + m3;
  }

  /* first col */
  arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 2U);

  /* second col */
  arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 2U);
}
870 
/*
  Radix-8-by-4 decimation-in-frequency stage, used for CFFT lengths whose
  radix-8 decomposition leaves a factor of 4 (32, 256, 2048 in the dispatcher
  below). It performs one radix-4 DIF split of the input into four
  quarter-length columns (applying the column twiddle factors), then runs the
  radix-8 butterfly on each column.

  S  - CFFT instance; provides fftLen and the twiddle table S->pTwiddle
  p1 - in-place interleaved complex buffer of 2*fftLen float32_t values

  The loop processes a mirrored top/bottom pair of samples per iteration
  (pointers p1..p4 walk forward, pEnd1..pEnd4 walk backward) so that one
  twiddle load serves two butterflies via vertical symmetry of the twiddles.
*/
static void arm_cfft_radix8by4_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
{
    uint32_t    L  = S->fftLen >> 1;  /* quarter of the buffer counted in float32_t units (fftLen/2 floats = fftLen/4 complex) */
    float32_t * pCol1, *pCol2, *pCol3, *pCol4, *pEnd1, *pEnd2, *pEnd3, *pEnd4;
    const float32_t *tw2, *tw3, *tw4;
    float32_t * p2 = p1 + L;          /* start of column 2 */
    float32_t * p3 = p2 + L;          /* start of column 3 */
    float32_t * p4 = p3 + L;          /* start of column 4 */
    float32_t t2[4], t3[4], t4[4], twR, twI;
    float32_t p1ap3_0, p1sp3_0, p1ap3_1, p1sp3_1;
    float32_t m0, m1, m2, m3;
    uint32_t l, twMod2, twMod3, twMod4;

    pCol1 = p1;         /* points to real values by default */
    pCol2 = p2;
    pCol3 = p3;
    pCol4 = p4;
    pEnd1 = p2 - 1;     /* points to imaginary values by default */
    pEnd2 = p3 - 1;
    pEnd3 = p4 - 1;
    pEnd4 = pEnd3 + L;

    tw2 = tw3 = tw4 = (float32_t *) S->pTwiddle;

    L >>= 1;            /* L is now the number of complex samples per column */

    /* do four dot Fourier transform */

    /* twiddle strides: column k uses every (k-1)-th twiddle of the full-length table */
    twMod2 = 2;
    twMod3 = 4;
    twMod4 = 6;

    /* First butterfly: all twiddles are 1, so no multiplies are needed. */
    /* TOP */
    p1ap3_0 = p1[0] + p3[0];
    p1sp3_0 = p1[0] - p3[0];
    p1ap3_1 = p1[1] + p3[1];
    p1sp3_1 = p1[1] - p3[1];

    /* col 2 */
    t2[0] = p1sp3_0 + p2[1] - p4[1];
    t2[1] = p1sp3_1 - p2[0] + p4[0];
    /* col 3 */
    t3[0] = p1ap3_0 - p2[0] - p4[0];
    t3[1] = p1ap3_1 - p2[1] - p4[1];
    /* col 4 */
    t4[0] = p1sp3_0 - p2[1] + p4[1];
    t4[1] = p1sp3_1 + p2[0] - p4[0];
    /* col 1 */
    *p1++ = p1ap3_0 + p2[0] + p4[0];
    *p1++ = p1ap3_1 + p2[1] + p4[1];

    /* Twiddle factors are ones */
    *p2++ = t2[0];
    *p2++ = t2[1];
    *p3++ = t3[0];
    *p3++ = t3[1];
    *p4++ = t4[0];
    *p4++ = t4[1];

    tw2 += twMod2;
    tw3 += twMod3;
    tw4 += twMod4;

    /* Each iteration processes one forward (top) and one mirrored backward
       (bottom) complex sample, sharing the twiddle values loaded for the top. */
    for (l = (L - 2) >> 1; l > 0; l-- )
    {
      /* TOP */
      p1ap3_0 = p1[0] + p3[0];
      p1sp3_0 = p1[0] - p3[0];
      p1ap3_1 = p1[1] + p3[1];
      p1sp3_1 = p1[1] - p3[1];
      /* col 2 */
      t2[0] = p1sp3_0 + p2[1] - p4[1];
      t2[1] = p1sp3_1 - p2[0] + p4[0];
      /* col 3 */
      t3[0] = p1ap3_0 - p2[0] - p4[0];
      t3[1] = p1ap3_1 - p2[1] - p4[1];
      /* col 4 */
      t4[0] = p1sp3_0 - p2[1] + p4[1];
      t4[1] = p1sp3_1 + p2[0] - p4[0];
      /* col 1 - top */
      *p1++ = p1ap3_0 + p2[0] + p4[0];
      *p1++ = p1ap3_1 + p2[1] + p4[1];

      /* BOTTOM (mirrored sample; pEndx[-1]/pEndx[0] are its real/imag parts) */
      p1ap3_1 = pEnd1[-1] + pEnd3[-1];
      p1sp3_1 = pEnd1[-1] - pEnd3[-1];
      p1ap3_0 = pEnd1[ 0] + pEnd3[0];
      p1sp3_0 = pEnd1[ 0] - pEnd3[0];
      /* col 2 */
      t2[2] = pEnd2[0] - pEnd4[0] + p1sp3_1;
      t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];
      /* col 3 */
      t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
      t3[3] = p1ap3_0 - pEnd2[ 0] - pEnd4[ 0];
      /* col 4 */
      t4[2] = pEnd2[ 0] - pEnd4[ 0] - p1sp3_1;
      t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
      /* col 1 - Bottom */
      *pEnd1-- = p1ap3_0 + pEnd2[ 0] + pEnd4[ 0];
      *pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];

      /* COL 2 */
      /* read twiddle factors */
      twR = *tw2++;
      twI = *tw2++;
      /* multiply by twiddle factors */
      /*  let    Z1 = a + i(b),   Z2 = c + i(d) */
      /*   =>  Z1 * Z2  =  (a*c - b*d) + i(b*c + a*d) */

      /* Top */
      m0 = t2[0] * twR;
      m1 = t2[1] * twI;
      m2 = t2[1] * twR;
      m3 = t2[0] * twI;

      *p2++ = m0 + m1;
      *p2++ = m2 - m3;
      /* use vertical symmetry col 2 */
      /* 0.9997 - 0.0245i  <==>  0.0245 - 0.9997i */
      /* Bottom */
      m0 = t2[3] * twI;
      m1 = t2[2] * twR;
      m2 = t2[2] * twI;
      m3 = t2[3] * twR;

      *pEnd2-- = m0 - m1;
      *pEnd2-- = m2 + m3;

      /* COL 3 */
      twR = tw3[0];
      twI = tw3[1];
      tw3 += twMod3;
      /* Top */
      m0 = t3[0] * twR;
      m1 = t3[1] * twI;
      m2 = t3[1] * twR;
      m3 = t3[0] * twI;

      *p3++ = m0 + m1;
      *p3++ = m2 - m3;
      /* use vertical symmetry col 3 */
      /* 0.9988 - 0.0491i  <==>  -0.9988 - 0.0491i */
      /* Bottom */
      m0 = -t3[3] * twR;
      m1 =  t3[2] * twI;
      m2 =  t3[2] * twR;
      m3 =  t3[3] * twI;

      *pEnd3-- = m0 - m1;
      *pEnd3-- = m3 - m2;

      /* COL 4 */
      twR = tw4[0];
      twI = tw4[1];
      tw4 += twMod4;
      /* Top */
      m0 = t4[0] * twR;
      m1 = t4[1] * twI;
      m2 = t4[1] * twR;
      m3 = t4[0] * twI;

      *p4++ = m0 + m1;
      *p4++ = m2 - m3;
      /* use vertical symmetry col 4 */
      /* 0.9973 - 0.0736i  <==>  -0.0736 + 0.9973i */
      /* Bottom */
      m0 = t4[3] * twI;
      m1 = t4[2] * twR;
      m2 = t4[2] * twI;
      m3 = t4[3] * twR;

      *pEnd4-- = m0 - m1;
      *pEnd4-- = m2 + m3;
    }

    /* MIDDLE */
    /* Twiddle factors are */
    /*  1.0000  0.7071-0.7071i  -1.0000i  -0.7071-0.7071i */
    p1ap3_0 = p1[0] + p3[0];
    p1sp3_0 = p1[0] - p3[0];
    p1ap3_1 = p1[1] + p3[1];
    p1sp3_1 = p1[1] - p3[1];

    /* col 2 */
    t2[0] = p1sp3_0 + p2[1] - p4[1];
    t2[1] = p1sp3_1 - p2[0] + p4[0];
    /* col 3 */
    t3[0] = p1ap3_0 - p2[0] - p4[0];
    t3[1] = p1ap3_1 - p2[1] - p4[1];
    /* col 4 */
    t4[0] = p1sp3_0 - p2[1] + p4[1];
    t4[1] = p1sp3_1 + p2[0] - p4[0];
    /* col 1 - Top */
    *p1++ = p1ap3_0 + p2[0] + p4[0];
    *p1++ = p1ap3_1 + p2[1] + p4[1];

    /* COL 2 */
    twR = tw2[0];
    twI = tw2[1];

    m0 = t2[0] * twR;
    m1 = t2[1] * twI;
    m2 = t2[1] * twR;
    m3 = t2[0] * twI;

    *p2++ = m0 + m1;
    *p2++ = m2 - m3;
    /* COL 3 */
    twR = tw3[0];
    twI = tw3[1];

    m0 = t3[0] * twR;
    m1 = t3[1] * twI;
    m2 = t3[1] * twR;
    m3 = t3[0] * twI;

    *p3++ = m0 + m1;
    *p3++ = m2 - m3;
    /* COL 4 */
    twR = tw4[0];
    twI = tw4[1];

    m0 = t4[0] * twR;
    m1 = t4[1] * twI;
    m2 = t4[1] * twR;
    m3 = t4[0] * twI;

    *p4++ = m0 + m1;
    *p4++ = m2 - m3;

    /* Finish with a radix-8 butterfly on each quarter-length column,
       using every 4th twiddle of the full-length table. */
    /* first col */
    arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 4U);

    /* second col */
    arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 4U);

    /* third col */
    arm_radix8_butterfly_f32 (pCol3, L, (float32_t *) S->pTwiddle, 4U);

    /* fourth col */
    arm_radix8_butterfly_f32 (pCol4, L, (float32_t *) S->pTwiddle, 4U);
}
1113 
1114 /**
1115   @addtogroup ComplexFFTF32
1116   @{
1117  */
1118 
1119 /**
1120   @brief         Processing function for the floating-point complex FFT.
1121   @param[in]     S              points to an instance of the floating-point CFFT structure
1122   @param[in,out] p1             points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
1123   @param[in]     ifftFlag       flag that selects transform direction
1124                    - value = 0: forward transform
1125                    - value = 1: inverse transform
1126   @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
1127                    - value = 0: disables bit reversal of output
1128                    - value = 1: enables bit reversal of output
1129  */
1130 
arm_cfft_f32(const arm_cfft_instance_f32 * S,float32_t * p1,uint8_t ifftFlag,uint8_t bitReverseFlag)1131 ARM_DSP_ATTRIBUTE void arm_cfft_f32(
1132   const arm_cfft_instance_f32 * S,
1133         float32_t * p1,
1134         uint8_t ifftFlag,
1135         uint8_t bitReverseFlag)
1136 {
1137   uint32_t  L = S->fftLen, l;
1138   float32_t invL, * pSrc;
1139 
1140   if (ifftFlag == 1U)
1141   {
1142     /* Conjugate input data */
1143     pSrc = p1 + 1;
1144     for (l = 0; l < L; l++)
1145     {
1146       *pSrc = -*pSrc;
1147       pSrc += 2;
1148     }
1149   }
1150 
1151   switch (L)
1152   {
1153   case 16:
1154   case 128:
1155   case 1024:
1156     arm_cfft_radix8by2_f32 ( (arm_cfft_instance_f32 *) S, p1);
1157     break;
1158   case 32:
1159   case 256:
1160   case 2048:
1161     arm_cfft_radix8by4_f32 ( (arm_cfft_instance_f32 *) S, p1);
1162     break;
1163   case 64:
1164   case 512:
1165   case 4096:
1166     arm_radix8_butterfly_f32 ( p1, L, (float32_t *) S->pTwiddle, 1);
1167     break;
1168   }
1169 
1170   if ( bitReverseFlag )
1171     arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);
1172 
1173   if (ifftFlag == 1U)
1174   {
1175     invL = 1.0f / (float32_t)L;
1176 
1177     /* Conjugate and scale output data */
1178     pSrc = p1;
1179     for (l= 0; l < L; l++)
1180     {
1181       *pSrc++ *=   invL ;
1182       *pSrc    = -(*pSrc) * invL;
1183       pSrc++;
1184     }
1185   }
1186 }
1187 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
1188 
1189 /**
1190   @} end of ComplexFFTF32 group
1191  */
1192