1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cfft_f32.c
4 * Description: Combined Radix Decimation in Frequency CFFT Floating point processing function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/transform_functions.h"
30 #include "arm_common_tables.h"
31
32 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
33
34 #include "arm_helium_utils.h"
35 #include "arm_vec_fft.h"
36 #include "arm_mve_tables.h"
37
38
arm_inverse_fft_length_f32(uint16_t fftLen)39 static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
40 {
41 float32_t retValue=1.0;
42
43 switch (fftLen)
44 {
45
46 case 4096U:
47 retValue = 0.000244140625;
48 break;
49
50 case 2048U:
51 retValue = 0.00048828125;
52 break;
53
54 case 1024U:
55 retValue = 0.0009765625f;
56 break;
57
58 case 512U:
59 retValue = 0.001953125;
60 break;
61
62 case 256U:
63 retValue = 0.00390625f;
64 break;
65
66 case 128U:
67 retValue = 0.0078125;
68 break;
69
70 case 64U:
71 retValue = 0.015625f;
72 break;
73
74 case 32U:
75 retValue = 0.03125;
76 break;
77
78 case 16U:
79 retValue = 0.0625f;
80 break;
81
82
83 default:
84 break;
85 }
86 return(retValue);
87 }
88
89
90
91
/**
  @brief         In-place forward radix-4 decimation-in-frequency CFFT core (Helium/MVE).
  @param[in]     S       points to the CFFT instance (rearranged twiddle tables and per-stage offsets)
  @param[in,out] pSrc    interleaved complex data, 2*fftLen float32 values; result is in digit-reversed order
  @param[in]     fftLen  transform length, a power of 4 (16, 64, 256, 1024, 4096)

  Every stage except the last applies the radix-4 butterfly and multiplies the
  B/C/D outputs by the conjugate of the W2/W1/W3 twiddles
  (MVE_CMPLX_MULT_FLT_Conj_AxB). The final stage has unity twiddles and is
  processed with gather/scatter accesses so all four butterfly legs are
  addressed from one auto-incrementing base register.
 */
static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen)
{
    f32x4_t vecTmp0, vecTmp1;
    f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f32x4_t vecA, vecB, vecC, vecD;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;
    int32_t iter = 1;
    /* Byte offsets for the last-stage gathers/scatters: they select complex
       elements {0, 1, 8, 9} of each 16-element group, expressed relative to a
       base that is pre-incremented by 64 bytes per iteration.
       NOTE(review): sizeof(q31_t *) is the pointer size — 4 bytes on the
       32-bit Cortex-M targets this path is built for; presumably used here as
       the 4-byte element size. */
    static const int32_t strides[4] = {
        (0 - 16) * (int32_t)sizeof(q31_t *),
        (1 - 16) * (int32_t)sizeof(q31_t *),
        (8 - 16) * (int32_t)sizeof(q31_t *),
        (9 - 16) * (int32_t)sizeof(q31_t *)
    };

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* All stages except the final trivial-twiddle stage. */
    for (int k = fftLen / 4u; k > 1; k >>= 2)
    {
        /* Per-stage twiddle tables, rearranged for contiguous vector loads. */
        float32_t const *p_rearranged_twiddle_tab_stride1 =
            &S->rearranged_twiddle_stride1[
            S->rearranged_twiddle_tab_stride1_arr[stage]];
        float32_t const *p_rearranged_twiddle_tab_stride2 =
            &S->rearranged_twiddle_stride2[
            S->rearranged_twiddle_tab_stride2_arr[stage]];
        float32_t const *p_rearranged_twiddle_tab_stride3 =
            &S->rearranged_twiddle_stride3[
            S->rearranged_twiddle_tab_stride3_arr[stage]];

        float32_t * pBase = pSrc;
        for (int i = 0; i < iter; i++)
        {
            /* Four butterfly legs, n2 complex elements apart. */
            float32_t *inA = pBase;
            float32_t *inB = inA + n2 * CMPLX_DIM;
            float32_t *inC = inB + n2 * CMPLX_DIM;
            float32_t *inD = inC + n2 * CMPLX_DIM;
            float32_t const *pW1 = p_rearranged_twiddle_tab_stride1;
            float32_t const *pW2 = p_rearranged_twiddle_tab_stride2;
            float32_t const *pW3 = p_rearranged_twiddle_tab_stride3;
            f32x4_t vecW;

            blkCnt = n2 / 2;
            /*
             * load 2 f32 complex pair
             */
            vecA = vldrwq_f32(inA);
            vecC = vldrwq_f32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_f32(inB);
                vecD = vldrwq_f32(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 4;

                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 4;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
                 */
                vecW = vld1q(pW1);
                pW1 +=4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 4;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 4;

                /* software pipelining: pre-load next A/C pair */
                vecA = vldrwq_f32(inA);
                vecC = vldrwq_f32(inC);

                blkCnt--;
            }
            pBase += CMPLX_DIM * n1;
        }
        /* Next stage: quarter the butterfly span, quadruple the group count. */
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * start of Last stage process
     */
    uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /* load scheduling */
    vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

    /* Each iteration handles 8 complex values (two radix-4 butterflies). */
    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration */
        vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

        vecTmp0 = vecSum0 + vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
255
/**
  @brief         Forward CFFT for lengths that are 2 * power-of-4 (Helium/MVE):
                 one radix-2 DIF stage, then two independent forward radix-4
                 transforms of length fftLen/2.
  @param[in]     S       points to the CFFT instance
  @param[in,out] pSrc    interleaved complex data, 2*fftLen float32 values
  @param[in]     fftLen  transform length (32, 128, 512 or 2048)
  @return        none
 */
static void arm_cfft_radix4by2_f32_mve(const arm_cfft_instance_f32 * S, float32_t *pSrc, uint32_t fftLen)
{
    float32_t const *pCoefVec;
    float32_t const *pCoef = S->pTwiddle;
    float32_t *pIn0, *pIn1;
    uint32_t n2;
    uint32_t blkCnt;
    f32x4_t vecIn0, vecIn1, vecSum, vecDiff;
    f32x4_t vecCmplxTmp, vecTw;


    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;   /* second half: n2 complex = fftLen floats away */
    pCoefVec = pCoef;

    /* radix-2 stage: 2 complex values (4 floats) per iteration */
    blkCnt = n2 / 2;
    while (blkCnt > 0U)
    {
        /* use the load intrinsic rather than dereferencing a casted f32x4_t
           pointer, which implied vector alignment of the user buffer */
        vecIn0 = vld1q(pIn0);
        vecIn1 = vld1q(pIn1);
        vecTw = vld1q(pCoefVec);
        pCoefVec += 4;

        vecSum = vecIn0 + vecIn1;
        vecDiff = vecIn0 - vecIn1;

        /* lower leg is multiplied by the conjugated twiddle */
        vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);

        vst1q(pIn0, vecSum);
        pIn0 += 4;
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        blkCnt--;
    }

    _arm_radix4_butterfly_f32_mve(S, pSrc, n2);

    _arm_radix4_butterfly_f32_mve(S, pSrc + fftLen, n2);
}
299
/**
  @brief         In-place inverse radix-4 decimation-in-frequency CFFT core (Helium/MVE).
  @param[in]     S            points to the CFFT instance (rearranged twiddle tables and per-stage offsets)
  @param[in,out] pSrc         interleaved complex data, 2*fftLen float32 values; result is in digit-reversed order
  @param[in]     fftLen       transform length, a power of 4 (16, 64, 256, 1024, 4096)
  @param[in]     onebyfftLen  1/fftLen scale factor applied to every output in the last stage

  Mirror of the forward core: twiddle products use the non-conjugated multiply
  (MVE_CMPLX_MULT_FLT_AxB) and the +i / -i butterfly legs are swapped relative
  to the forward transform. The final (unity-twiddle) stage also folds in the
  1/fftLen scaling.
 */
static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen, float32_t onebyfftLen)
{
    f32x4_t vecTmp0, vecTmp1;
    f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f32x4_t vecA, vecB, vecC, vecD;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;
    int32_t iter = 1;
    /* Byte offsets for the last-stage gathers/scatters (complex elements
       {0, 1, 8, 9} of each 16-element group, relative to a base that is
       pre-incremented by 64 bytes). NOTE(review): sizeof(q31_t *) is the
       pointer size — 4 bytes on 32-bit Cortex-M; presumably intended as the
       4-byte element size. */
    static const int32_t strides[4] = {
        (0 - 16) * (int32_t)sizeof(q31_t *),
        (1 - 16) * (int32_t)sizeof(q31_t *),
        (8 - 16) * (int32_t)sizeof(q31_t *),
        (9 - 16) * (int32_t)sizeof(q31_t *)
    };

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* All stages except the final trivial-twiddle stage. */
    for (int k = fftLen / 4; k > 1; k >>= 2)
    {
        /* Per-stage twiddle tables, rearranged for contiguous vector loads. */
        float32_t const *p_rearranged_twiddle_tab_stride1 =
            &S->rearranged_twiddle_stride1[
            S->rearranged_twiddle_tab_stride1_arr[stage]];
        float32_t const *p_rearranged_twiddle_tab_stride2 =
            &S->rearranged_twiddle_stride2[
            S->rearranged_twiddle_tab_stride2_arr[stage]];
        float32_t const *p_rearranged_twiddle_tab_stride3 =
            &S->rearranged_twiddle_stride3[
            S->rearranged_twiddle_tab_stride3_arr[stage]];

        float32_t * pBase = pSrc;
        for (int i = 0; i < iter; i++)
        {
            /* Four butterfly legs, n2 complex elements apart. */
            float32_t *inA = pBase;
            float32_t *inB = inA + n2 * CMPLX_DIM;
            float32_t *inC = inB + n2 * CMPLX_DIM;
            float32_t *inD = inC + n2 * CMPLX_DIM;
            float32_t const *pW1 = p_rearranged_twiddle_tab_stride1;
            float32_t const *pW2 = p_rearranged_twiddle_tab_stride2;
            float32_t const *pW3 = p_rearranged_twiddle_tab_stride3;
            f32x4_t vecW;

            blkCnt = n2 / 2;
            /*
             * load 2 f32 complex pair
             */
            vecA = vldrwq_f32(inA);
            vecC = vldrwq_f32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_f32(inB);
                vecD = vldrwq_f32(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 4;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 4;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 4;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 4;

                /* software pipelining: pre-load next A/C pair */
                vecA = vldrwq_f32(inA);
                vecC = vldrwq_f32(inC);

                blkCnt--;
            }
            pBase += CMPLX_DIM * n1;
        }
        /* Next stage: quarter the butterfly span, quadruple the group count. */
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * start of Last stage process
     */
    uint32x4_t vecScGathAddr = vld1q_u32 ((uint32_t*)strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

    /* Each iteration handles 8 complex values (two radix-4 butterflies),
       scaling every output by 1/fftLen. */
    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration */
        vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

        vecTmp0 = vecSum0 + vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
467
/**
  @brief         Inverse CFFT for lengths that are 2 * power-of-4 (Helium/MVE):
                 one radix-2 DIF stage, then two independent inverse radix-4
                 transforms of length fftLen/2, with 1/fftLen scaling applied
                 inside the radix-4 cores.
  @param[in]     S       points to the CFFT instance
  @param[in,out] pSrc    interleaved complex data, 2*fftLen float32 values
  @param[in]     fftLen  transform length (32, 128, 512 or 2048)
  @return        none
 */
static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t *pSrc, uint32_t fftLen)
{
    float32_t const *pCoefVec;
    float32_t const *pCoef = S->pTwiddle;
    float32_t *pIn0, *pIn1;
    uint32_t n2;
    float32_t onebyfftLen = arm_inverse_fft_length_f32(fftLen);
    uint32_t blkCnt;
    f32x4_t vecIn0, vecIn1, vecSum, vecDiff;
    f32x4_t vecCmplxTmp, vecTw;


    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;   /* second half: n2 complex = fftLen floats away */
    pCoefVec = pCoef;

    /* radix-2 stage: 2 complex values (4 floats) per iteration */
    blkCnt = n2 / 2;
    while (blkCnt > 0U)
    {
        /* use the load intrinsic rather than dereferencing a casted f32x4_t
           pointer, which implied vector alignment of the user buffer */
        vecIn0 = vld1q(pIn0);
        vecIn1 = vld1q(pIn1);
        vecTw = vld1q(pCoefVec);
        pCoefVec += 4;

        vecSum = vecIn0 + vecIn1;
        vecDiff = vecIn0 - vecIn1;

        /* inverse transform: lower leg uses the non-conjugated twiddle */
        vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff);

        vst1q(pIn0, vecSum);
        pIn0 += 4;
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        blkCnt--;
    }

    _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, n2, onebyfftLen);

    _arm_radix4_butterfly_inverse_f32_mve(S, pSrc + fftLen, n2, onebyfftLen);
}
510
511
512 /**
513 @addtogroup ComplexFFTF32
514 @{
515 */
516
517 /**
518 @brief Processing function for the floating-point complex FFT.
519 @param[in] S points to an instance of the floating-point CFFT structure
520 @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
521 @param[in] ifftFlag flag that selects transform direction
522 - value = 0: forward transform
523 - value = 1: inverse transform
524 @param[in] bitReverseFlag flag that enables / disables bit reversal of output
525 - value = 0: disables bit reversal of output
526 - value = 1: enables bit reversal of output
527 */
528
529
arm_cfft_f32(const arm_cfft_instance_f32 * S,float32_t * pSrc,uint8_t ifftFlag,uint8_t bitReverseFlag)530 ARM_DSP_ATTRIBUTE void arm_cfft_f32(
531 const arm_cfft_instance_f32 * S,
532 float32_t * pSrc,
533 uint8_t ifftFlag,
534 uint8_t bitReverseFlag)
535 {
536 uint32_t fftLen = S->fftLen;
537
538 if (ifftFlag == 1U) {
539
540 switch (fftLen) {
541 case 16:
542 case 64:
543 case 256:
544 case 1024:
545 case 4096:
546 _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen));
547 break;
548
549 case 32:
550 case 128:
551 case 512:
552 case 2048:
553 arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
554 break;
555 }
556 } else {
557 switch (fftLen) {
558 case 16:
559 case 64:
560 case 256:
561 case 1024:
562 case 4096:
563 _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
564 break;
565
566 case 32:
567 case 128:
568 case 512:
569 case 2048:
570 arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
571 break;
572 }
573 }
574
575
576 if (bitReverseFlag)
577 {
578
579 arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
580
581 }
582 }
583
584
585 #else
586 extern void arm_radix8_butterfly_f32(
587 float32_t * pSrc,
588 uint16_t fftLen,
589 const float32_t * pCoef,
590 uint16_t twidCoefModifier);
591
592 extern void arm_bitreversal_32(
593 uint32_t * pSrc,
594 const uint16_t bitRevLen,
595 const uint16_t * pBitRevTable);
596
597 /**
598 @ingroup groupTransforms
599 */
600
601 /**
602 @defgroup ComplexFFT Complex FFT Functions
603
604 @par
605 The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
606 Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
607 than the DFT, especially for long lengths.
608 The algorithms described in this section
609 operate on complex data. A separate set of functions is devoted to handling
610 of real sequences.
611 @par
612 There are separate algorithms for handling floating-point, Q15, and Q31 data
613 types. The algorithms available for each data type are described next.
614 @par
615 The FFT functions operate in-place. That is, the array holding the input data
616 will also be used to hold the corresponding result. The input data is complex
617 and contains <code>2*fftLen</code> interleaved values as shown below.
618 <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
619 The FFT result will be contained in the same array and the frequency domain
620 values will have the same interleaving.
621
622 @par Floating-point
623 The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
624 stages are performed along with a single radix-2 or radix-4 stage, as needed.
625 The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
626 a different twiddle factor table.
627 @par
628 The function uses the standard FFT definition and output values may grow by a
629 factor of <code>fftLen</code> when computing the forward transform. The
630 inverse transform includes a scale of <code>1/fftLen</code> as part of the
631 calculation and this matches the textbook definition of the inverse FFT.
632 @par
633 For the MVE version, the new arm_cfft_init_f32 initialization function is
634 <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
635 needed FFTs.</b> Other FFT versions can continue to be initialized as
636 explained below.
637 @par
638 For not MVE versions, pre-initialized data structures containing twiddle factors
639 and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include
640 this header in your function and then pass one of the constant structures as
641 an argument to arm_cfft_f32. For example:
642 @par
643 <code>arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
644 @par
645 computes a 64-point inverse complex FFT including bit reversal.
646 The data structures are treated as constant data and not modified during the
647 calculation. The same data structure can be reused for multiple transforms
648 including mixing forward and inverse transforms.
649 @par
650 Earlier releases of the library provided separate radix-2 and radix-4
651 algorithms that operated on floating-point data. These functions are still
652 provided but are deprecated. The older functions are slower and less general
653 than the new functions.
654 @par
655 An example of initialization of the constants for the arm_cfft_f32 function follows:
656 @code
657 const static arm_cfft_instance_f32 *S;
658 ...
659 switch (length) {
660 case 16:
661 S = &arm_cfft_sR_f32_len16;
662 break;
663 case 32:
664 S = &arm_cfft_sR_f32_len32;
665 break;
666 case 64:
667 S = &arm_cfft_sR_f32_len64;
668 break;
669 case 128:
670 S = &arm_cfft_sR_f32_len128;
671 break;
672 case 256:
673 S = &arm_cfft_sR_f32_len256;
674 break;
675 case 512:
676 S = &arm_cfft_sR_f32_len512;
677 break;
678 case 1024:
679 S = &arm_cfft_sR_f32_len1024;
680 break;
681 case 2048:
682 S = &arm_cfft_sR_f32_len2048;
683 break;
684 case 4096:
685 S = &arm_cfft_sR_f32_len4096;
686 break;
687 }
688 @endcode
689
690 @par
691 The new arm_cfft_init_f32 can also be used.
  @par Q15 and Q31
                   The fixed-point (Q15 and Q31) complex FFTs use a mixed-radix algorithm. Multiple radix-4
                   stages are performed along with a single radix-2 stage, as needed.
                   The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
                   a different twiddle factor table.
697 @par
698 The function uses the standard FFT definition and output values may grow by a
699 factor of <code>fftLen</code> when computing the forward transform. The
700 inverse transform includes a scale of <code>1/fftLen</code> as part of the
701 calculation and this matches the textbook definition of the inverse FFT.
702 @par
703 Pre-initialized data structures containing twiddle factors and bit reversal
704 tables are provided and defined in <code>arm_const_structs.h</code>. Include
705 this header in your function and then pass one of the constant structures as
706 an argument to arm_cfft_q31. For example:
707 @par
708 <code>arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
709 @par
710 computes a 64-point inverse complex FFT including bit reversal.
711 The data structures are treated as constant data and not modified during the
712 calculation. The same data structure can be reused for multiple transforms
713 including mixing forward and inverse transforms.
714 @par
                   Earlier releases of the library provided separate radix-2 and radix-4
                   algorithms that operated on Q15 and Q31 data. These functions are still
                   provided but are deprecated. The older functions are slower and less general
                   than the new functions.
719 @par
720 An example of initialization of the constants for the arm_cfft_q31 function follows:
721 @code
722 const static arm_cfft_instance_q31 *S;
723 ...
724 switch (length) {
725 case 16:
726 S = &arm_cfft_sR_q31_len16;
727 break;
728 case 32:
729 S = &arm_cfft_sR_q31_len32;
730 break;
731 case 64:
732 S = &arm_cfft_sR_q31_len64;
733 break;
734 case 128:
735 S = &arm_cfft_sR_q31_len128;
736 break;
737 case 256:
738 S = &arm_cfft_sR_q31_len256;
739 break;
740 case 512:
741 S = &arm_cfft_sR_q31_len512;
742 break;
743 case 1024:
744 S = &arm_cfft_sR_q31_len1024;
745 break;
746 case 2048:
747 S = &arm_cfft_sR_q31_len2048;
748 break;
749 case 4096:
750 S = &arm_cfft_sR_q31_len4096;
751 break;
752 }
753 @endcode
754
755 */
756
/**
  @brief         Radix-8-by-2 decomposition: one radix-2 DIF stage over the
                 full length, then two radix-8 sub-transforms of length
                 fftLen/2 (twiddle stride 2).
  @param[in]     S   points to the CFFT instance
  @param[in,out] p1  interleaved complex data, 2*fftLen float32 values
  @return        none

  The radix-2 stage processes two complex values per iteration from the start
  of each half (p1/p2) and, in the same iteration, two more from the midpoints
  (pMid1/pMid2), exploiting the vertical symmetry of the twiddle table so one
  {twR, twI} pair serves both positions.
 */
static void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
{
  uint32_t L  = S->fftLen;
  float32_t * pCol1, * pCol2, * pMid1, * pMid2;
  float32_t * p2 = p1 + L;
  const float32_t * tw = (float32_t *) S->pTwiddle;
  float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
  float32_t m0, m1, m2, m3;
  uint32_t l;

  pCol1 = p1;
  pCol2 = p2;

  /* Define new length */
  L >>= 1;

  /* Initialize mid pointers */
  pMid1 = p1 + L;
  pMid2 = p2 + L;

  /* do two dot Fourier transform */
  /* Each iteration consumes 2 complex values (4 floats) per pointer. */
  for (l = L >> 2; l > 0; l-- )
  {
    t1[0] = p1[0];
    t1[1] = p1[1];
    t1[2] = p1[2];
    t1[3] = p1[3];

    t2[0] = p2[0];
    t2[1] = p2[1];
    t2[2] = p2[2];
    t2[3] = p2[3];

    t3[0] = pMid1[0];
    t3[1] = pMid1[1];
    t3[2] = pMid1[2];
    t3[3] = pMid1[3];

    t4[0] = pMid2[0];
    t4[1] = pMid2[1];
    t4[2] = pMid2[2];
    t4[3] = pMid2[3];

    *p1++ = t1[0] + t2[0];
    *p1++ = t1[1] + t2[1];
    *p1++ = t1[2] + t2[2];
    *p1++ = t1[3] + t2[3];    /* col 1 */

    t2[0] = t1[0] - t2[0];
    t2[1] = t1[1] - t2[1];
    t2[2] = t1[2] - t2[2];
    t2[3] = t1[3] - t2[3];    /* for col 2 */

    *pMid1++ = t3[0] + t4[0];
    *pMid1++ = t3[1] + t4[1];
    *pMid1++ = t3[2] + t4[2];
    *pMid1++ = t3[3] + t4[3]; /* col 1 */

    /* NOTE: sign is reversed (t4 - t3) relative to t2 above; this pairs
       with the swapped twR/twI roles in the pMid2 products below. */
    t4[0] = t4[0] - t3[0];
    t4[1] = t4[1] - t3[1];
    t4[2] = t4[2] - t3[2];
    t4[3] = t4[3] - t3[3];    /* for col 2 */

    twR = *tw++;
    twI = *tw++;

    /* multiply by twiddle factors */
    m0 = t2[0] * twR;
    m1 = t2[1] * twI;
    m2 = t2[1] * twR;
    m3 = t2[0] * twI;

    /* R  =  R  *  Tr - I * Ti */
    *p2++ = m0 + m1;
    /* I  =  I  *  Tr + R * Ti */
    *p2++ = m2 - m3;

    /* use vertical symmetry */
    /*  0.9988 - 0.0491i <==> -0.0491 - 0.9988i */
    m0 = t4[0] * twI;
    m1 = t4[1] * twR;
    m2 = t4[1] * twI;
    m3 = t4[0] * twR;

    *pMid2++ = m0 - m1;
    *pMid2++ = m2 + m3;

    twR = *tw++;
    twI = *tw++;

    m0 = t2[2] * twR;
    m1 = t2[3] * twI;
    m2 = t2[3] * twR;
    m3 = t2[2] * twI;

    *p2++ = m0 + m1;
    *p2++ = m2 - m3;

    m0 = t4[2] * twI;
    m1 = t4[3] * twR;
    m2 = t4[3] * twI;
    m3 = t4[2] * twR;

    *pMid2++ = m0 - m1;
    *pMid2++ = m2 + m3;
  }

  /* first col */
  arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 2U);

  /* second col */
  arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 2U);
}
870
/**
  @brief         Radix-8-by-4 decomposition: one radix-4 DIF stage over the
                 full length, then four radix-8 sub-transforms of length
                 fftLen/4 (twiddle stride 4).
  @param[in]     S   points to the CFFT instance
  @param[in,out] p1  interleaved complex data, 2*fftLen float32 values
  @return        none

  The radix-4 stage walks the four quarter-columns from both ends at once:
  p1..p4 move forward from the top while pEnd1..pEnd4 move backward from the
  bottom, so one {twR, twI} pair per column serves a top and a mirrored bottom
  butterfly (vertical symmetry of the twiddle table). The first butterfly
  (unity twiddles) and the exact middle one (where only the top halves remain)
  are peeled out of the main loop.
 */
static void arm_cfft_radix8by4_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
{
  uint32_t    L  = S->fftLen >> 1;
  float32_t * pCol1, *pCol2, *pCol3, *pCol4, *pEnd1, *pEnd2, *pEnd3, *pEnd4;
  const float32_t *tw2, *tw3, *tw4;
  float32_t * p2 = p1 + L;
  float32_t * p3 = p2 + L;
  float32_t * p4 = p3 + L;
  float32_t t2[4], t3[4], t4[4], twR, twI;
  float32_t p1ap3_0, p1sp3_0, p1ap3_1, p1sp3_1;
  float32_t m0, m1, m2, m3;
  uint32_t l, twMod2, twMod3, twMod4;

  pCol1 = p1;         /* points to real values by default */
  pCol2 = p2;
  pCol3 = p3;
  pCol4 = p4;
  pEnd1 = p2 - 1;     /* points to imaginary values by default */
  pEnd2 = p3 - 1;
  pEnd3 = p4 - 1;
  pEnd4 = pEnd3 + L;

  tw2 = tw3 = tw4 = (float32_t *) S->pTwiddle;

  L >>= 1;

  /* do four dot Fourier transform */

  /* Twiddle strides: column k uses every (2*(k-1))-th table entry. */
  twMod2 = 2;
  twMod3 = 4;
  twMod4 = 6;

  /* First butterfly: twiddles are all 1, no multiplies needed. */
  /* TOP */
  p1ap3_0 = p1[0] + p3[0];
  p1sp3_0 = p1[0] - p3[0];
  p1ap3_1 = p1[1] + p3[1];
  p1sp3_1 = p1[1] - p3[1];

  /* col 2 */
  t2[0] = p1sp3_0 + p2[1] - p4[1];
  t2[1] = p1sp3_1 - p2[0] + p4[0];
  /* col 3 */
  t3[0] = p1ap3_0 - p2[0] - p4[0];
  t3[1] = p1ap3_1 - p2[1] - p4[1];
  /* col 4 */
  t4[0] = p1sp3_0 - p2[1] + p4[1];
  t4[1] = p1sp3_1 + p2[0] - p4[0];
  /* col 1 */
  *p1++ = p1ap3_0 + p2[0] + p4[0];
  *p1++ = p1ap3_1 + p2[1] + p4[1];

  /* Twiddle factors are ones */
  *p2++ = t2[0];
  *p2++ = t2[1];
  *p3++ = t3[0];
  *p3++ = t3[1];
  *p4++ = t4[0];
  *p4++ = t4[1];

  tw2 += twMod2;
  tw3 += twMod3;
  tw4 += twMod4;

  /* Main loop: one top butterfly and one mirrored bottom butterfly per
     iteration, sharing the same twiddle pair per column. */
  for (l = (L - 2) >> 1; l > 0; l-- )
  {
    /* TOP */
    p1ap3_0 = p1[0] + p3[0];
    p1sp3_0 = p1[0] - p3[0];
    p1ap3_1 = p1[1] + p3[1];
    p1sp3_1 = p1[1] - p3[1];
    /* col 2 */
    t2[0] = p1sp3_0 + p2[1] - p4[1];
    t2[1] = p1sp3_1 - p2[0] + p4[0];
    /* col 3 */
    t3[0] = p1ap3_0 - p2[0] - p4[0];
    t3[1] = p1ap3_1 - p2[1] - p4[1];
    /* col 4 */
    t4[0] = p1sp3_0 - p2[1] + p4[1];
    t4[1] = p1sp3_1 + p2[0] - p4[0];
    /* col 1 - top */
    *p1++ = p1ap3_0 + p2[0] + p4[0];
    *p1++ = p1ap3_1 + p2[1] + p4[1];

    /* BOTTOM */
    /* pEnd pointers walk backward: [-1] is the real part, [0] the imag. */
    p1ap3_1 = pEnd1[-1] + pEnd3[-1];
    p1sp3_1 = pEnd1[-1] - pEnd3[-1];
    p1ap3_0 = pEnd1[ 0] + pEnd3[0];
    p1sp3_0 = pEnd1[ 0] - pEnd3[0];
    /* col 2 */
    t2[2] = pEnd2[0] - pEnd4[0] + p1sp3_1;
    t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];
    /* col 3 */
    t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
    t3[3] = p1ap3_0 - pEnd2[ 0] - pEnd4[ 0];
    /* col 4 */
    t4[2] = pEnd2[ 0] - pEnd4[ 0] - p1sp3_1;
    t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
    /* col 1 - Bottom */
    *pEnd1-- = p1ap3_0 + pEnd2[ 0] + pEnd4[ 0];
    *pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];

    /* COL 2 */
    /* read twiddle factors */
    twR = *tw2++;
    twI = *tw2++;
    /* multiply by twiddle factors */
    /*  let    Z1 = a + i(b),   Z2 = c + i(d) */
    /*   =>  Z1 * Z2  =  (a*c - b*d) + i(b*c + a*d) */

    /* Top */
    m0 = t2[0] * twR;
    m1 = t2[1] * twI;
    m2 = t2[1] * twR;
    m3 = t2[0] * twI;

    *p2++ = m0 + m1;
    *p2++ = m2 - m3;
    /* use vertical symmetry col 2 */
    /*  0.9997 - 0.0245i  <==>  0.0245 - 0.9997i */
    /* Bottom */
    m0 = t2[3] * twI;
    m1 = t2[2] * twR;
    m2 = t2[2] * twI;
    m3 = t2[3] * twR;

    *pEnd2-- = m0 - m1;
    *pEnd2-- = m2 + m3;

    /* COL 3 */
    twR = tw3[0];
    twI = tw3[1];
    tw3 += twMod3;
    /* Top */
    m0 = t3[0] * twR;
    m1 = t3[1] * twI;
    m2 = t3[1] * twR;
    m3 = t3[0] * twI;

    *p3++ = m0 + m1;
    *p3++ = m2 - m3;
    /* use vertical symmetry col 3 */
    /*  0.9988 - 0.0491i  <==>  -0.9988 - 0.0491i */
    /* Bottom */
    m0 = -t3[3] * twR;
    m1 =  t3[2] * twI;
    m2 =  t3[2] * twR;
    m3 =  t3[3] * twI;

    *pEnd3-- = m0 - m1;
    *pEnd3-- = m3 - m2;

    /* COL 4 */
    twR = tw4[0];
    twI = tw4[1];
    tw4 += twMod4;
    /* Top */
    m0 = t4[0] * twR;
    m1 = t4[1] * twI;
    m2 = t4[1] * twR;
    m3 = t4[0] * twI;

    *p4++ = m0 + m1;
    *p4++ = m2 - m3;
    /* use vertical symmetry col 4 */
    /*  0.9973 - 0.0736i  <==>  -0.0736 + 0.9973i */
    /* Bottom */
    m0 = t4[3] * twI;
    m1 = t4[2] * twR;
    m2 = t4[2] * twI;
    m3 = t4[3] * twR;

    *pEnd4-- = m0 - m1;
    *pEnd4-- = m2 + m3;
  }

  /* MIDDLE */
  /* Twiddle factors are */
  /*  1.0000  0.7071-0.7071i  -1.0000i  -0.7071-0.7071i */
  p1ap3_0 = p1[0] + p3[0];
  p1sp3_0 = p1[0] - p3[0];
  p1ap3_1 = p1[1] + p3[1];
  p1sp3_1 = p1[1] - p3[1];

  /* col 2 */
  t2[0] = p1sp3_0 + p2[1] - p4[1];
  t2[1] = p1sp3_1 - p2[0] + p4[0];
  /* col 3 */
  t3[0] = p1ap3_0 - p2[0] - p4[0];
  t3[1] = p1ap3_1 - p2[1] - p4[1];
  /* col 4 */
  t4[0] = p1sp3_0 - p2[1] + p4[1];
  t4[1] = p1sp3_1 + p2[0] - p4[0];
  /* col 1 - Top */
  *p1++ = p1ap3_0 + p2[0] + p4[0];
  *p1++ = p1ap3_1 + p2[1] + p4[1];

  /* COL 2 */
  twR = tw2[0];
  twI = tw2[1];

  m0 = t2[0] * twR;
  m1 = t2[1] * twI;
  m2 = t2[1] * twR;
  m3 = t2[0] * twI;

  *p2++ = m0 + m1;
  *p2++ = m2 - m3;
  /* COL 3 */
  twR = tw3[0];
  twI = tw3[1];

  m0 = t3[0] * twR;
  m1 = t3[1] * twI;
  m2 = t3[1] * twR;
  m3 = t3[0] * twI;

  *p3++ = m0 + m1;
  *p3++ = m2 - m3;
  /* COL 4 */
  twR = tw4[0];
  twI = tw4[1];

  m0 = t4[0] * twR;
  m1 = t4[1] * twI;
  m2 = t4[1] * twR;
  m3 = t4[0] * twI;

  *p4++ = m0 + m1;
  *p4++ = m2 - m3;

  /* first col */
  arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 4U);

  /* second col */
  arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 4U);

  /* third col */
  arm_radix8_butterfly_f32 (pCol3, L, (float32_t *) S->pTwiddle, 4U);

  /* fourth col */
  arm_radix8_butterfly_f32 (pCol4, L, (float32_t *) S->pTwiddle, 4U);
}
1113
1114 /**
1115 @addtogroup ComplexFFTF32
1116 @{
1117 */
1118
1119 /**
1120 @brief Processing function for the floating-point complex FFT.
1121 @param[in] S points to an instance of the floating-point CFFT structure
1122 @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
1123 @param[in] ifftFlag flag that selects transform direction
1124 - value = 0: forward transform
1125 - value = 1: inverse transform
1126 @param[in] bitReverseFlag flag that enables / disables bit reversal of output
1127 - value = 0: disables bit reversal of output
1128 - value = 1: enables bit reversal of output
1129 */
1130
arm_cfft_f32(const arm_cfft_instance_f32 * S,float32_t * p1,uint8_t ifftFlag,uint8_t bitReverseFlag)1131 ARM_DSP_ATTRIBUTE void arm_cfft_f32(
1132 const arm_cfft_instance_f32 * S,
1133 float32_t * p1,
1134 uint8_t ifftFlag,
1135 uint8_t bitReverseFlag)
1136 {
1137 uint32_t L = S->fftLen, l;
1138 float32_t invL, * pSrc;
1139
1140 if (ifftFlag == 1U)
1141 {
1142 /* Conjugate input data */
1143 pSrc = p1 + 1;
1144 for (l = 0; l < L; l++)
1145 {
1146 *pSrc = -*pSrc;
1147 pSrc += 2;
1148 }
1149 }
1150
1151 switch (L)
1152 {
1153 case 16:
1154 case 128:
1155 case 1024:
1156 arm_cfft_radix8by2_f32 ( (arm_cfft_instance_f32 *) S, p1);
1157 break;
1158 case 32:
1159 case 256:
1160 case 2048:
1161 arm_cfft_radix8by4_f32 ( (arm_cfft_instance_f32 *) S, p1);
1162 break;
1163 case 64:
1164 case 512:
1165 case 4096:
1166 arm_radix8_butterfly_f32 ( p1, L, (float32_t *) S->pTwiddle, 1);
1167 break;
1168 }
1169
1170 if ( bitReverseFlag )
1171 arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);
1172
1173 if (ifftFlag == 1U)
1174 {
1175 invL = 1.0f / (float32_t)L;
1176
1177 /* Conjugate and scale output data */
1178 pSrc = p1;
1179 for (l= 0; l < L; l++)
1180 {
1181 *pSrc++ *= invL ;
1182 *pSrc = -(*pSrc) * invL;
1183 pSrc++;
1184 }
1185 }
1186 }
1187 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
1188
1189 /**
1190 @} end of ComplexFFTF32 group
1191 */
1192