1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cfft_f32.c
4 * Description: Combined Radix Decimation in Frequency CFFT Floating point processing function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/transform_functions.h"
30 #include "arm_common_tables.h"
31
32 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
33
34 #include "arm_helium_utils.h"
35 #include "arm_vec_fft.h"
36 #include "arm_mve_tables.h"
37
38
arm_inverse_fft_length_f32(uint16_t fftLen)39 static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
40 {
41 float32_t retValue=1.0;
42
43 switch (fftLen)
44 {
45
46 case 4096U:
47 retValue = 0.000244140625;
48 break;
49
50 case 2048U:
51 retValue = 0.00048828125;
52 break;
53
54 case 1024U:
55 retValue = 0.0009765625f;
56 break;
57
58 case 512U:
59 retValue = 0.001953125;
60 break;
61
62 case 256U:
63 retValue = 0.00390625f;
64 break;
65
66 case 128U:
67 retValue = 0.0078125;
68 break;
69
70 case 64U:
71 retValue = 0.015625f;
72 break;
73
74 case 32U:
75 retValue = 0.03125;
76 break;
77
78 case 16U:
79 retValue = 0.0625f;
80 break;
81
82
83 default:
84 break;
85 }
86 return(retValue);
87 }
88
89
90
91
/**
  Forward radix-4 decimation-in-frequency CFFT core (Helium/MVE).

  Runs all radix-4 stages of an fftLen-point (power of 4) complex FFT in
  place on the interleaved [re, im] float data at pSrc.  Twiddle factors
  come from the per-stage rearranged tables held in the instance (one
  table per output leg: W1/W2/W3).  The final stage needs no twiddles and
  is performed with gather loads / scatter stores so that four butterflies
  are processed per vector operation.  No scaling is applied (forward
  transform).
 */
static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen)
{
    f32x4_t vecTmp0, vecTmp1;
    f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f32x4_t vecA, vecB, vecC, vecD;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;
    int32_t iter = 1;
    /* Byte offsets selecting floats {0,1,8,9} around a base pointer that the
       write-back gather advances by 64 bytes per iteration (hence the -16
       element bias).  NOTE(review): written as sizeof(q31_t *) -- assumes
       4-byte pointers on the supported Cortex-M targets; confirm. */
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q31_t *),
        (1 - 16) * sizeof(q31_t *),
        (8 - 16) * sizeof(q31_t *),
        (9 - 16) * sizeof(q31_t *)
    };

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* All stages except the last; k tracks the remaining butterfly span. */
    for (int k = fftLen / 4u; k > 1; k >>= 2)
    {
        /* 'iter' independent sub-transforms of length n1 at this stage. */
        for (int i = 0; i < iter; i++)
        {
            float32_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                S->rearranged_twiddle_tab_stride1_arr[stage]];
            float32_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                S->rearranged_twiddle_tab_stride2_arr[stage]];
            float32_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                S->rearranged_twiddle_tab_stride3_arr[stage]];
            float32_t const *pW1, *pW2, *pW3;
            /* Four quarter-span legs of the current sub-transform. */
            float32_t *inA = pSrc + CMPLX_DIM * i * n1;
            float32_t *inB = inA + n2 * CMPLX_DIM;
            float32_t *inC = inB + n2 * CMPLX_DIM;
            float32_t *inD = inC + n2 * CMPLX_DIM;
            f32x4_t vecW;


            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 2;
            /*
             * load 2 f32 complex pair
             */
            vecA = vldrwq_f32(inA);
            vecC = vldrwq_f32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_f32(inB);
                vecD = vldrwq_f32(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 4;

                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 4;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
                 */
                vecW = vld1q(pW1);
                pW1 +=4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 4;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 4;

                /* pre-load for next loop iteration */
                vecA = vldrwq_f32(inA);
                vecC = vldrwq_f32(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;   /* 4x more, 4x shorter sub-transforms next stage */
        stage++;
    }

    /*
     * start of Last stage process (radix-4 butterflies, unit twiddles)
     */
    uint32x4_t vecScGathAddr = vld1q_u32(strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /* load scheduling */
    vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration (also advances the base by 64 bytes) */
        vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

        vecTmp0 = vecSum0 + vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
255
/**
  Forward CFFT for lengths 2 * 4^n (Helium/MVE).

  Performs one radix-2 decimation-in-frequency stage on the fftLen-point
  input (splitting it into two fftLen/2-point halves, the lower half
  multiplied by twiddles), then completes each half with the radix-4 core.

  Fixes vs. previous revision: the trailing dead store 'pIn0 = pSrc;' is
  removed, and the vector loads use the vld1q intrinsic (as done for the
  twiddles) instead of dereferencing a casted f32x4_t pointer.
 */
static void arm_cfft_radix4by2_f32_mve(const arm_cfft_instance_f32 * S, float32_t *pSrc, uint32_t fftLen)
{
    float32_t const *pCoefVec;
    float32_t const *pCoef = S->pTwiddle;
    float32_t *pIn0, *pIn1;
    uint32_t n2;
    uint32_t blkCnt;
    f32x4_t vecIn0, vecIn1, vecSum, vecDiff;
    f32x4_t vecCmplxTmp, vecTw;


    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;   /* fftLen floats = n2 complex samples ahead */
    pCoefVec = pCoef;

    /* radix-2 stage: 2 complex pairs per vector iteration */
    blkCnt = n2 / 2;
    while (blkCnt > 0U)
    {
        vecIn0 = vld1q(pIn0);
        vecIn1 = vld1q(pIn1);
        vecTw = vld1q(pCoefVec);
        pCoefVec += 4;

        vecSum = vecIn0 + vecIn1;
        vecDiff = vecIn0 - vecIn1;

        /* second half: (in0 - in1) .* twiddle (conjugate-multiply variant) */
        vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);

        vst1q(pIn0, vecSum);
        pIn0 += 4;
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        blkCnt--;
    }

    /* complete both halves with the radix-4 core */
    _arm_radix4_butterfly_f32_mve(S, pSrc, n2);

    _arm_radix4_butterfly_f32_mve(S, pSrc + fftLen, n2);
}
299
/**
  Inverse radix-4 decimation-in-frequency CFFT core (Helium/MVE).

  Mirror of _arm_radix4_butterfly_f32_mve for the inverse transform: the
  complex multiplies use the non-conjugate variant (MVE_CMPLX_MULT_FLT_AxB)
  and the +i/-i legs are swapped relative to the forward core.  The final
  stage additionally scales every output by onebyfftLen (1/fftLen), so the
  result matches the textbook inverse FFT definition.  Operates in place
  on the interleaved [re, im] float data at pSrc.
 */
static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen, float32_t onebyfftLen)
{
    f32x4_t vecTmp0, vecTmp1;
    f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f32x4_t vecA, vecB, vecC, vecD;
    f32x4_t vecW;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;
    int32_t iter = 1;
    /* Byte offsets selecting floats {0,1,8,9} around a base pointer that the
       write-back gather advances by 64 bytes per iteration (hence the -16
       element bias).  NOTE(review): written as sizeof(q31_t *) -- assumes
       4-byte pointers on the supported Cortex-M targets; confirm. */
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q31_t *),
        (1 - 16) * sizeof(q31_t *),
        (8 - 16) * sizeof(q31_t *),
        (9 - 16) * sizeof(q31_t *)
    };

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* All stages except the last; k tracks the remaining butterfly span. */
    for (int k = fftLen / 4; k > 1; k >>= 2)
    {
        /* 'iter' independent sub-transforms of length n1 at this stage. */
        for (int i = 0; i < iter; i++)
        {
            float32_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                S->rearranged_twiddle_tab_stride1_arr[stage]];
            float32_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                S->rearranged_twiddle_tab_stride2_arr[stage]];
            float32_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                S->rearranged_twiddle_tab_stride3_arr[stage]];
            float32_t const *pW1, *pW2, *pW3;
            /* Four quarter-span legs of the current sub-transform. */
            float32_t *inA = pSrc + CMPLX_DIM * i * n1;
            float32_t *inB = inA + n2 * CMPLX_DIM;
            float32_t *inC = inB + n2 * CMPLX_DIM;
            float32_t *inD = inC + n2 * CMPLX_DIM;

            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 2;
            /*
             * load 2 f32 complex pair
             */
            vecA = vldrwq_f32(inA);
            vecC = vldrwq_f32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_f32(inB);
                vecD = vldrwq_f32(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 4;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 4;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'  (inverse: conjugate of the
                 * forward core's -i leg)
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * ... .* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 4;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'  (inverse: conjugate of the
                 * forward core's +i leg)
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * ... .* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 4;

                /* pre-load for next loop iteration */
                vecA = vldrwq_f32(inA);
                vecC = vldrwq_f32(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;   /* 4x more, 4x shorter sub-transforms next stage */
        stage++;
    }

    /*
     * start of Last stage process (radix-4, unit twiddles, 1/fftLen scaling)
     */
    uint32x4_t vecScGathAddr = vld1q_u32 (strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration (also advances the base by 64 bytes) */
        vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

        vecTmp0 = vecSum0 + vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
466
/**
  Inverse CFFT for lengths 2 * 4^n (Helium/MVE).

  One radix-2 decimation-in-frequency stage splits the fftLen-point input
  into two halves (the lower half multiplied by twiddles), then each
  fftLen/2-point half is finished by the inverse radix-4 core, which also
  applies the 1/fftLen scaling.
 */
static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t *pSrc, uint32_t fftLen)
{
    float32_t const *pTw = S->pTwiddle;
    float32_t *pTop = pSrc;
    float32_t *pBot = pSrc + fftLen;     /* fftLen floats = half complex span */
    float32_t scale = arm_inverse_fft_length_f32(fftLen);
    uint32_t half = fftLen >> 1;
    f32x4_t vTop, vBot, vTw;

    /* radix-2 stage: two complex samples per vector iteration */
    for (uint32_t cnt = half / 2; cnt > 0U; cnt--)
    {
        vTop = vld1q(pTop);
        vBot = vld1q(pBot);
        vTw = vld1q(pTw);
        pTw += 4;

        /* upper half: sum; lower half: twiddled difference */
        vst1q(pTop, vTop + vBot);
        pTop += 4;
        vst1q(pBot, MVE_CMPLX_MULT_FLT_AxB(vTw, vTop - vBot));
        pBot += 4;
    }

    _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, half, scale);

    _arm_radix4_butterfly_inverse_f32_mve(S, pSrc + fftLen, half, scale);
}
509
510
511 /**
512 @addtogroup ComplexFFT
513 @{
514 */
515
516 /**
517 @brief Processing function for the floating-point complex FFT.
518 @param[in] S points to an instance of the floating-point CFFT structure
519 @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
520 @param[in] ifftFlag flag that selects transform direction
521 - value = 0: forward transform
522 - value = 1: inverse transform
523 @param[in] bitReverseFlag flag that enables / disables bit reversal of output
524 - value = 0: disables bit reversal of output
525 - value = 1: enables bit reversal of output
526 @return none
527 */
528
529
void arm_cfft_f32(
  const arm_cfft_instance_f32 * S,
  float32_t * pSrc,
  uint8_t ifftFlag,
  uint8_t bitReverseFlag)
{
    uint32_t fftLen = S->fftLen;

    if (ifftFlag == 1U) {

        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            /* power-of-4 lengths: pure radix-4 decomposition */
            _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(fftLen));
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            /* 2 * power-of-4 lengths: radix-2 stage then radix-4 core */
            arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
            break;

        default:
            /* unsupported length (instance not built by arm_cfft_init_f32):
               leave the buffer untouched */
            break;
        }
    } else {
        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            /* power-of-4 lengths: pure radix-4 decomposition */
            _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            /* 2 * power-of-4 lengths: radix-2 stage then radix-4 core */
            arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
            break;

        default:
            /* unsupported length: leave the buffer untouched */
            break;
        }
    }


    if (bitReverseFlag)
    {
        /* reorder the naturally-scrambled DIF output in place */
        arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);

    }
}
583
584
585 #else
586 extern void arm_radix8_butterfly_f32(
587 float32_t * pSrc,
588 uint16_t fftLen,
589 const float32_t * pCoef,
590 uint16_t twidCoefModifier);
591
592 extern void arm_bitreversal_32(
593 uint32_t * pSrc,
594 const uint16_t bitRevLen,
595 const uint16_t * pBitRevTable);
596
597 /**
598 @ingroup groupTransforms
599 */
600
601 /**
602 @defgroup ComplexFFT Complex FFT Functions
603
604 @par
605 The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
606 Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
607 than the DFT, especially for long lengths.
608 The algorithms described in this section
609 operate on complex data. A separate set of functions is devoted to handling
610 of real sequences.
611 @par
612 There are separate algorithms for handling floating-point, Q15, and Q31 data
613 types. The algorithms available for each data type are described next.
614 @par
615 The FFT functions operate in-place. That is, the array holding the input data
616 will also be used to hold the corresponding result. The input data is complex
617 and contains <code>2*fftLen</code> interleaved values as shown below.
618 <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
619 The FFT result will be contained in the same array and the frequency domain
620 values will have the same interleaving.
621
622 @par Floating-point
623 The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
624 stages are performed along with a single radix-2 or radix-4 stage, as needed.
625 The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
626 a different twiddle factor table.
627 @par
628 The function uses the standard FFT definition and output values may grow by a
629 factor of <code>fftLen</code> when computing the forward transform. The
630 inverse transform includes a scale of <code>1/fftLen</code> as part of the
631 calculation and this matches the textbook definition of the inverse FFT.
632 @par
633 For the MVE version, the new arm_cfft_init_f32 initialization function is
634 <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
635 needed FFTs.</b> Other FFT versions can continue to be initialized as
636 explained below.
637 @par
                   For non-MVE versions, pre-initialized data structures containing twiddle factors
639 and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include
640 this header in your function and then pass one of the constant structures as
641 an argument to arm_cfft_f32. For example:
642 @par
                   <code>arm_cfft_f32(&arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
644 @par
645 computes a 64-point inverse complex FFT including bit reversal.
646 The data structures are treated as constant data and not modified during the
647 calculation. The same data structure can be reused for multiple transforms
648 including mixing forward and inverse transforms.
649 @par
650 Earlier releases of the library provided separate radix-2 and radix-4
651 algorithms that operated on floating-point data. These functions are still
652 provided but are deprecated. The older functions are slower and less general
653 than the new functions.
654 @par
655 An example of initialization of the constants for the arm_cfft_f32 function follows:
656 @code
657 const static arm_cfft_instance_f32 *S;
658 ...
659 switch (length) {
660 case 16:
661 S = &arm_cfft_sR_f32_len16;
662 break;
663 case 32:
664 S = &arm_cfft_sR_f32_len32;
665 break;
666 case 64:
667 S = &arm_cfft_sR_f32_len64;
668 break;
669 case 128:
670 S = &arm_cfft_sR_f32_len128;
671 break;
672 case 256:
673 S = &arm_cfft_sR_f32_len256;
674 break;
675 case 512:
676 S = &arm_cfft_sR_f32_len512;
677 break;
678 case 1024:
679 S = &arm_cfft_sR_f32_len1024;
680 break;
681 case 2048:
682 S = &arm_cfft_sR_f32_len2048;
683 break;
684 case 4096:
685 S = &arm_cfft_sR_f32_len4096;
686 break;
687 }
688 @endcode
689 @par
690 The new arm_cfft_init_f32 can also be used.
691 @par Q15 and Q31
                   The fixed-point complex FFT uses a mixed-radix algorithm.  Multiple radix-4
693 stages are performed along with a single radix-2 stage, as needed.
694 The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
695 a different twiddle factor table.
696 @par
697 The function uses the standard FFT definition and output values may grow by a
698 factor of <code>fftLen</code> when computing the forward transform. The
699 inverse transform includes a scale of <code>1/fftLen</code> as part of the
700 calculation and this matches the textbook definition of the inverse FFT.
701 @par
702 Pre-initialized data structures containing twiddle factors and bit reversal
703 tables are provided and defined in <code>arm_const_structs.h</code>. Include
704 this header in your function and then pass one of the constant structures as
705 an argument to arm_cfft_q31. For example:
706 @par
                   <code>arm_cfft_q31(&arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
708 @par
709 computes a 64-point inverse complex FFT including bit reversal.
710 The data structures are treated as constant data and not modified during the
711 calculation. The same data structure can be reused for multiple transforms
712 including mixing forward and inverse transforms.
713 @par
714 Earlier releases of the library provided separate radix-2 and radix-4
                   algorithms that operated on fixed-point data.  These functions are still
716 provided but are deprecated. The older functions are slower and less general
717 than the new functions.
718 @par
719 An example of initialization of the constants for the arm_cfft_q31 function follows:
720 @code
721 const static arm_cfft_instance_q31 *S;
722 ...
723 switch (length) {
724 case 16:
725 S = &arm_cfft_sR_q31_len16;
726 break;
727 case 32:
728 S = &arm_cfft_sR_q31_len32;
729 break;
730 case 64:
731 S = &arm_cfft_sR_q31_len64;
732 break;
733 case 128:
734 S = &arm_cfft_sR_q31_len128;
735 break;
736 case 256:
737 S = &arm_cfft_sR_q31_len256;
738 break;
739 case 512:
740 S = &arm_cfft_sR_q31_len512;
741 break;
742 case 1024:
743 S = &arm_cfft_sR_q31_len1024;
744 break;
745 case 2048:
746 S = &arm_cfft_sR_q31_len2048;
747 break;
748 case 4096:
749 S = &arm_cfft_sR_q31_len4096;
750 break;
751 }
752 @endcode
753
754 */
755
/**
  Core of the scalar floating-point CFFT for lengths 2 * 8^n.

  One radix-2 decimation-in-frequency stage splits the L-point transform
  into two L/2-point halves (the second half multiplied by twiddles taken
  with stride 2), then each half is finished with radix-8 butterflies.
  Works in place on the interleaved [re, im] data at p1.  Each loop
  iteration handles two complex samples from the front of each half and
  two from the middle ("pMid" pointers), reusing each (twR, twI) pair via
  the vertical symmetry of the twiddle table.
 */
void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
{
    uint32_t L = S->fftLen;
    float32_t * pCol1, * pCol2, * pMid1, * pMid2;
    float32_t * p2 = p1 + L;
    const float32_t * tw = (float32_t *) S->pTwiddle;
    float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
    float32_t m0, m1, m2, m3;
    uint32_t l;

    pCol1 = p1;
    pCol2 = p2;

    /* Define new length */
    L >>= 1;

    /* Initialize mid pointers */
    pMid1 = p1 + L;
    pMid2 = p2 + L;

    /* do two dot Fourier transform */
    for (l = L >> 2; l > 0; l-- )
    {
        /* two complex samples from the front of each half ... */
        t1[0] = p1[0];
        t1[1] = p1[1];
        t1[2] = p1[2];
        t1[3] = p1[3];

        t2[0] = p2[0];
        t2[1] = p2[1];
        t2[2] = p2[2];
        t2[3] = p2[3];

        /* ... and two from the middle of each half */
        t3[0] = pMid1[0];
        t3[1] = pMid1[1];
        t3[2] = pMid1[2];
        t3[3] = pMid1[3];

        t4[0] = pMid2[0];
        t4[1] = pMid2[1];
        t4[2] = pMid2[2];
        t4[3] = pMid2[3];

        *p1++ = t1[0] + t2[0];
        *p1++ = t1[1] + t2[1];
        *p1++ = t1[2] + t2[2];
        *p1++ = t1[3] + t2[3];          /* col 1 */

        t2[0] = t1[0] - t2[0];
        t2[1] = t1[1] - t2[1];
        t2[2] = t1[2] - t2[2];
        t2[3] = t1[3] - t2[3];          /* for col 2 */

        *pMid1++ = t3[0] + t4[0];
        *pMid1++ = t3[1] + t4[1];
        *pMid1++ = t3[2] + t4[2];
        *pMid1++ = t3[3] + t4[3];       /* col 1 */

        t4[0] = t4[0] - t3[0];
        t4[1] = t4[1] - t3[1];
        t4[2] = t4[2] - t3[2];
        t4[3] = t4[3] - t3[3];          /* for col 2 */

        twR = *tw++;
        twI = *tw++;

        /* multiply by twiddle factors */
        m0 = t2[0] * twR;
        m1 = t2[1] * twI;
        m2 = t2[1] * twR;
        m3 = t2[0] * twI;

        /* R  =  R  *  Tr - I * Ti */
        *p2++ = m0 + m1;
        /* I  =  I  *  Tr + R * Ti */
        *p2++ = m2 - m3;

        /* use vertical symmetry: the mid sample reuses the same twiddle pair
           with real/imag roles exchanged */
        /*  0.9988 - 0.0491i <==> -0.0491 - 0.9988i */
        m0 = t4[0] * twI;
        m1 = t4[1] * twR;
        m2 = t4[1] * twI;
        m3 = t4[0] * twR;

        *pMid2++ = m0 - m1;
        *pMid2++ = m2 + m3;

        /* second complex sample of this iteration, next twiddle pair */
        twR = *tw++;
        twI = *tw++;

        m0 = t2[2] * twR;
        m1 = t2[3] * twI;
        m2 = t2[3] * twR;
        m3 = t2[2] * twI;

        *p2++ = m0 + m1;
        *p2++ = m2 - m3;

        m0 = t4[2] * twI;
        m1 = t4[3] * twR;
        m2 = t4[3] * twI;
        m3 = t4[2] * twR;

        *pMid2++ = m0 - m1;
        *pMid2++ = m2 + m3;
    }

    /* first col */
    arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 2U);

    /* second col */
    arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 2U);
}
869
/**
  Core of the scalar floating-point CFFT for lengths 4 * 8^n.

  One radix-4 decimation-in-frequency stage splits the fftLen-point
  transform into four fftLen/4-point columns (columns 2-4 multiplied by
  twiddles taken with strides 2, 4 and 6), then each column is finished
  with radix-8 butterflies.  Works in place on interleaved [re, im] data.
  The main loop walks the columns simultaneously from the top (p1..p4)
  and from the bottom (pEnd1..pEnd4, which point at imaginary parts and
  move backwards), exploiting the vertical symmetry of the twiddle table
  so each (twR, twI) pair serves a top and a bottom butterfly.  The first
  (unit-twiddle) and middle samples are handled separately outside the
  loop.
 */
void arm_cfft_radix8by4_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
{
    uint32_t L = S->fftLen >> 1;
    float32_t * pCol1, *pCol2, *pCol3, *pCol4, *pEnd1, *pEnd2, *pEnd3, *pEnd4;
    const float32_t *tw2, *tw3, *tw4;
    float32_t * p2 = p1 + L;
    float32_t * p3 = p2 + L;
    float32_t * p4 = p3 + L;
    float32_t t2[4], t3[4], t4[4], twR, twI;
    float32_t p1ap3_0, p1sp3_0, p1ap3_1, p1sp3_1;
    float32_t m0, m1, m2, m3;
    uint32_t l, twMod2, twMod3, twMod4;

    pCol1 = p1;         /* points to real values by default */
    pCol2 = p2;
    pCol3 = p3;
    pCol4 = p4;
    pEnd1 = p2 - 1;     /* points to imaginary values by default */
    pEnd2 = p3 - 1;
    pEnd3 = p4 - 1;
    pEnd4 = pEnd3 + L;

    tw2 = tw3 = tw4 = (float32_t *) S->pTwiddle;

    L >>= 1;

    /* do four dot Fourier transform */

    /* twiddle strides for columns 2, 3 and 4 */
    twMod2 = 2;
    twMod3 = 4;
    twMod4 = 6;

    /* TOP: first sample, all twiddles are 1 */
    p1ap3_0 = p1[0] + p3[0];
    p1sp3_0 = p1[0] - p3[0];
    p1ap3_1 = p1[1] + p3[1];
    p1sp3_1 = p1[1] - p3[1];

    /* col 2 */
    t2[0] = p1sp3_0 + p2[1] - p4[1];
    t2[1] = p1sp3_1 - p2[0] + p4[0];
    /* col 3 */
    t3[0] = p1ap3_0 - p2[0] - p4[0];
    t3[1] = p1ap3_1 - p2[1] - p4[1];
    /* col 4 */
    t4[0] = p1sp3_0 - p2[1] + p4[1];
    t4[1] = p1sp3_1 + p2[0] - p4[0];
    /* col 1 */
    *p1++ = p1ap3_0 + p2[0] + p4[0];
    *p1++ = p1ap3_1 + p2[1] + p4[1];

    /* Twiddle factors are ones */
    *p2++ = t2[0];
    *p2++ = t2[1];
    *p3++ = t3[0];
    *p3++ = t3[1];
    *p4++ = t4[0];
    *p4++ = t4[1];

    tw2 += twMod2;
    tw3 += twMod3;
    tw4 += twMod4;

    /* symmetric pairs: one sample from the top, one from the bottom */
    for (l = (L - 2) >> 1; l > 0; l-- )
    {
        /* TOP */
        p1ap3_0 = p1[0] + p3[0];
        p1sp3_0 = p1[0] - p3[0];
        p1ap3_1 = p1[1] + p3[1];
        p1sp3_1 = p1[1] - p3[1];
        /* col 2 */
        t2[0] = p1sp3_0 + p2[1] - p4[1];
        t2[1] = p1sp3_1 - p2[0] + p4[0];
        /* col 3 */
        t3[0] = p1ap3_0 - p2[0] - p4[0];
        t3[1] = p1ap3_1 - p2[1] - p4[1];
        /* col 4 */
        t4[0] = p1sp3_0 - p2[1] + p4[1];
        t4[1] = p1sp3_1 + p2[0] - p4[0];
        /* col 1 - top */
        *p1++ = p1ap3_0 + p2[0] + p4[0];
        *p1++ = p1ap3_1 + p2[1] + p4[1];

        /* BOTTOM (pEnd* walk backwards; [-1] is the real part, [0] the imag) */
        p1ap3_1 = pEnd1[-1] + pEnd3[-1];
        p1sp3_1 = pEnd1[-1] - pEnd3[-1];
        p1ap3_0 = pEnd1[ 0] + pEnd3[0];
        p1sp3_0 = pEnd1[ 0] - pEnd3[0];
        /* col 2 */
        t2[2] = pEnd2[0] - pEnd4[0] + p1sp3_1;
        t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];
        /* col 3 */
        t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
        t3[3] = p1ap3_0 - pEnd2[ 0] - pEnd4[ 0];
        /* col 4 */
        t4[2] = pEnd2[ 0] - pEnd4[ 0] - p1sp3_1;
        t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
        /* col 1 - Bottom */
        *pEnd1-- = p1ap3_0 + pEnd2[ 0] + pEnd4[ 0];
        *pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];

        /* COL 2 */
        /* read twiddle factors */
        twR = *tw2++;
        twI = *tw2++;
        /* multiply by twiddle factors */
        /*  let    Z1 = a + i(b),   Z2 = c + i(d) */
        /*   =>  Z1 * Z2  =  (a*c - b*d) + i(b*c + a*d) */

        /* Top */
        m0 = t2[0] * twR;
        m1 = t2[1] * twI;
        m2 = t2[1] * twR;
        m3 = t2[0] * twI;

        *p2++ = m0 + m1;
        *p2++ = m2 - m3;
        /* use vertical symmetry col 2 */
        /*  0.9997 - 0.0245i  <==>  0.0245 - 0.9997i */
        /* Bottom */
        m0 = t2[3] * twI;
        m1 = t2[2] * twR;
        m2 = t2[2] * twI;
        m3 = t2[3] * twR;

        *pEnd2-- = m0 - m1;
        *pEnd2-- = m2 + m3;

        /* COL 3 */
        twR = tw3[0];
        twI = tw3[1];
        tw3 += twMod3;
        /* Top */
        m0 = t3[0] * twR;
        m1 = t3[1] * twI;
        m2 = t3[1] * twR;
        m3 = t3[0] * twI;

        *p3++ = m0 + m1;
        *p3++ = m2 - m3;
        /* use vertical symmetry col 3 */
        /*  0.9988 - 0.0491i  <==>  -0.9988 - 0.0491i */
        /* Bottom */
        m0 = -t3[3] * twR;
        m1 = t3[2] * twI;
        m2 = t3[2] * twR;
        m3 = t3[3] * twI;

        *pEnd3-- = m0 - m1;
        *pEnd3-- = m3 - m2;

        /* COL 4 */
        twR = tw4[0];
        twI = tw4[1];
        tw4 += twMod4;
        /* Top */
        m0 = t4[0] * twR;
        m1 = t4[1] * twI;
        m2 = t4[1] * twR;
        m3 = t4[0] * twI;

        *p4++ = m0 + m1;
        *p4++ = m2 - m3;
        /* use vertical symmetry col 4 */
        /*  0.9973 - 0.0736i  <==>  -0.0736 + 0.9973i */
        /* Bottom */
        m0 = t4[3] * twI;
        m1 = t4[2] * twR;
        m2 = t4[2] * twI;
        m3 = t4[3] * twR;

        *pEnd4-- = m0 - m1;
        *pEnd4-- = m2 + m3;
    }

    /* MIDDLE: the self-symmetric sample */
    /* Twiddle factors are */
    /*  1.0000  0.7071-0.7071i  -1.0000i  -0.7071-0.7071i */
    p1ap3_0 = p1[0] + p3[0];
    p1sp3_0 = p1[0] - p3[0];
    p1ap3_1 = p1[1] + p3[1];
    p1sp3_1 = p1[1] - p3[1];

    /* col 2 */
    t2[0] = p1sp3_0 + p2[1] - p4[1];
    t2[1] = p1sp3_1 - p2[0] + p4[0];
    /* col 3 */
    t3[0] = p1ap3_0 - p2[0] - p4[0];
    t3[1] = p1ap3_1 - p2[1] - p4[1];
    /* col 4 */
    t4[0] = p1sp3_0 - p2[1] + p4[1];
    t4[1] = p1sp3_1 + p2[0] - p4[0];
    /* col 1 - Top */
    *p1++ = p1ap3_0 + p2[0] + p4[0];
    *p1++ = p1ap3_1 + p2[1] + p4[1];

    /* COL 2 */
    twR = tw2[0];
    twI = tw2[1];

    m0 = t2[0] * twR;
    m1 = t2[1] * twI;
    m2 = t2[1] * twR;
    m3 = t2[0] * twI;

    *p2++ = m0 + m1;
    *p2++ = m2 - m3;
    /* COL 3 */
    twR = tw3[0];
    twI = tw3[1];

    m0 = t3[0] * twR;
    m1 = t3[1] * twI;
    m2 = t3[1] * twR;
    m3 = t3[0] * twI;

    *p3++ = m0 + m1;
    *p3++ = m2 - m3;
    /* COL 4 */
    twR = tw4[0];
    twI = tw4[1];

    m0 = t4[0] * twR;
    m1 = t4[1] * twI;
    m2 = t4[1] * twR;
    m3 = t4[0] * twI;

    *p4++ = m0 + m1;
    *p4++ = m2 - m3;

    /* first col */
    arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 4U);

    /* second col */
    arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 4U);

    /* third col */
    arm_radix8_butterfly_f32 (pCol3, L, (float32_t *) S->pTwiddle, 4U);

    /* fourth col */
    arm_radix8_butterfly_f32 (pCol4, L, (float32_t *) S->pTwiddle, 4U);
}
1112
1113 /**
1114 @addtogroup ComplexFFT
1115 @{
1116 */
1117
1118 /**
1119 @brief Processing function for the floating-point complex FFT.
1120 @param[in] S points to an instance of the floating-point CFFT structure
1121 @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
1122 @param[in] ifftFlag flag that selects transform direction
1123 - value = 0: forward transform
1124 - value = 1: inverse transform
1125 @param[in] bitReverseFlag flag that enables / disables bit reversal of output
1126 - value = 0: disables bit reversal of output
1127 - value = 1: enables bit reversal of output
1128 @return none
1129 */
1130
void arm_cfft_f32(
  const arm_cfft_instance_f32 * S,
  float32_t * p1,
  uint8_t ifftFlag,
  uint8_t bitReverseFlag)
{
    uint32_t L = S->fftLen, l;
    float32_t invL, * pSrc;

    if (ifftFlag == 1U)
    {
        /* Inverse computed as conj(FFT(conj(x))) / L: conjugate input first. */
        pSrc = p1 + 1;
        for (l = 0; l < L; l++)
        {
            *pSrc = -*pSrc;
            pSrc += 2;
        }
    }

    switch (L)
    {
    case 16:
    case 128:
    case 1024:
        /* lengths 2 * 8^n: radix-2 stage then radix-8 columns */
        arm_cfft_radix8by2_f32 ( (arm_cfft_instance_f32 *) S, p1);
        break;
    case 32:
    case 256:
    case 2048:
        /* lengths 4 * 8^n: radix-4 stage then radix-8 columns */
        arm_cfft_radix8by4_f32 ( (arm_cfft_instance_f32 *) S, p1);
        break;
    case 64:
    case 512:
    case 4096:
        /* lengths 8^n: pure radix-8 */
        arm_radix8_butterfly_f32 ( p1, L, (float32_t *) S->pTwiddle, 1);
        break;
    default:
        /* unsupported length: leave the buffer untouched */
        break;
    }

    if ( bitReverseFlag )
        arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);

    if (ifftFlag == 1U)
    {
        invL = 1.0f / (float32_t)L;

        /* Conjugate and scale output data to finish the inverse transform. */
        pSrc = p1;
        for (l= 0; l < L; l++)
        {
            *pSrc++ *= invL ;
            *pSrc = -(*pSrc) * invL;
            pSrc++;
        }
    }
}
1187 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
1188
1189 /**
1190 @} end of ComplexFFT group
1191 */
1192