1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
 * Title:        arm_cfft_f16.c
4 * Description: Combined Radix Decimation in Frequency CFFT Floating point processing function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/transform_functions_f16.h"
30 #include "arm_common_tables_f16.h"
31
32
33 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
34
35 #include "arm_helium_utils.h"
36 #include "arm_vec_fft.h"
37 #include "arm_mve_tables_f16.h"
38
39
arm_inverse_fft_length_f16(uint16_t fftLen)40 static float16_t arm_inverse_fft_length_f16(uint16_t fftLen)
41 {
42 float16_t retValue=1.0;
43
44 switch (fftLen)
45 {
46
47 case 4096U:
48 retValue = (float16_t)0.000244140625f;
49 break;
50
51 case 2048U:
52 retValue = (float16_t)0.00048828125f;
53 break;
54
55 case 1024U:
56 retValue = (float16_t)0.0009765625f;
57 break;
58
59 case 512U:
60 retValue = (float16_t)0.001953125f;
61 break;
62
63 case 256U:
64 retValue = (float16_t)0.00390625f;
65 break;
66
67 case 128U:
68 retValue = (float16_t)0.0078125f;
69 break;
70
71 case 64U:
72 retValue = (float16_t)0.015625f;
73 break;
74
75 case 32U:
76 retValue = (float16_t)0.03125f;
77 break;
78
79 case 16U:
80 retValue = (float16_t)0.0625f;
81 break;
82
83
84 default:
85 break;
86 }
87 return(retValue);
88 }
89
90
/**
  @brief         In-place forward radix-4 decimation-in-frequency butterfly
                 for float16 complex data (interleaved re/im), Helium/MVE
                 vectorized. All radix-4 stages except the last read the
                 rearranged twiddle tables from @p S; the trivial-twiddle
                 last stage is done with gather/scatter accesses.
  @param[in]     S       CFFT instance (rearranged twiddle tables and offsets)
  @param[in,out] pSrc    complex buffer of 2*fftLen float16 values
  @param[in]     fftLen  transform length (power of 4 for this kernel)
 */
static void _arm_radix4_butterfly_f16_mve(const arm_cfft_instance_f16 * S, float16_t *pSrc, uint32_t fftLen)
{
    f16x8_t vecTmp0, vecTmp1;
    f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f16x8_t vecA, vecB, vecC, vecD;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;             /* index into rearranged twiddle offset arrays */
    int32_t iter = 1;               /* butterfly groups in the current stage */
    /* Byte offsets seeding the last-stage gather/scatter base vector; the
       -16 bias pre-compensates the +64 write-back applied at each load.
       NOTE(review): offsets address 32-bit lanes (complex f16 pairs) yet are
       scaled by sizeof(float16_t *) — pointer size, 4 bytes on 32-bit
       M-profile targets where MVE exists, so the values come out right;
       confirm if ever ported. */
    static const uint32_t strides[4] =
        {(0 - 16) * sizeof(float16_t *)
        , (4 - 16) * sizeof(float16_t *)
        , (8 - 16) * sizeof(float16_t *)
        , (12 - 16) * sizeof(float16_t *)};

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* All stages except the last (k = butterflies per group). */
    for (int k = fftLen / 4u; k > 1; k >>= 2)
    {
        for (int i = 0; i < iter; i++)
        {
            /* Twiddle streams for the three non-trivial outputs of this stage. */
            float16_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                S->rearranged_twiddle_tab_stride1_arr[stage]];
            float16_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                S->rearranged_twiddle_tab_stride2_arr[stage]];
            float16_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                S->rearranged_twiddle_tab_stride3_arr[stage]];
            float16_t const *pW1, *pW2, *pW3;
            /* Four quarter-span legs of the i-th butterfly group. */
            float16_t *inA = pSrc + CMPLX_DIM * i * n1;
            float16_t *inB = inA + n2 * CMPLX_DIM;
            float16_t *inC = inB + n2 * CMPLX_DIM;
            float16_t *inD = inC + n2 * CMPLX_DIM;
            f16x8_t vecW;

            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 4;
            /*
             * load 4 f16 complex samples (8 lanes); A/C loads are software
             * pipelined ahead of the loop
             */
            vecA = vldrhq_f16(inA);
            vecC = vldrhq_f16(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrhq_f16(inB);
                vecD = vldrhq_f16(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 8;

                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 8;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 8;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 8;

                /* pre-load A/C for the next iteration */
                vecA = vldrhq_f16(inA);
                vecC = vldrhq_f16(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;   /* 4x more groups in the next, shorter stage */
        stage++;
    }

    /*
     * start of Last stage process: radix-4 with unity twiddles, operating on
     * 4 interleaved streams via vector gather/scatter (16 complex per pass)
     */
    uint32x4_t vecScGathAddr = vld1q_u32(strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /* load scheduling */
    vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);

    blkCnt = (fftLen >> 4);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4);
        vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration (also advances base by 64 bytes) */
        vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);

        vecTmp0 = vecSum0 + vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
253
/**
  @brief         Forward CFFT for lengths 2 * 4^n (32, 128, 512, 2048): one
                 radix-2 stage splits the problem into two half-length
                 sub-transforms that are finished by the radix-4 kernel.
  @param[in]     S       CFFT instance (twiddle tables)
  @param[in,out] pSrc    complex buffer of 2*fftLen float16 values
  @param[in]     fftLen  transform length
 */
static void arm_cfft_radix4by2_f16_mve(const arm_cfft_instance_f16 * S, float16_t *pSrc, uint32_t fftLen)
{
    float16_t const *pCoefVec;
    float16_t const *pCoef = S->pTwiddle;
    float16_t *pIn0, *pIn1;
    uint32_t n2;
    uint32_t blkCnt;
    f16x8_t vecIn0, vecIn1, vecSum, vecDiff;
    f16x8_t vecCmplxTmp, vecTw;


    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;   /* fftLen f16 values = n2 complex elements ahead */
    pCoefVec = pCoef;

    /* radix-2 stage, 4 complex f16 samples per iteration */
    blkCnt = n2 / 4;
    while (blkCnt > 0U)
    {
        vecIn0 = *(f16x8_t *) pIn0;
        vecIn1 = *(f16x8_t *) pIn1;
        vecTw = vld1q(pCoefVec);
        pCoefVec += 8;

        vecSum = vaddq(vecIn0, vecIn1);
        vecDiff = vsubq(vecIn0, vecIn1);

        /* lower half gets (in0 - in1) rotated by the conjugate twiddle */
        vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);

        vst1q(pIn0, vecSum);
        pIn0 += 8;
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 8;

        blkCnt--;
    }

    /* two independent half-length radix-4 transforms */
    _arm_radix4_butterfly_f16_mve(S, pSrc, n2);

    _arm_radix4_butterfly_f16_mve(S, pSrc + fftLen, n2);

    /* (dead trailing `pIn0 = pSrc;` removed: pIn0 is local and unused after) */
}
297
/**
  @brief         In-place inverse radix-4 decimation-in-frequency butterfly
                 for float16 complex data, Helium/MVE vectorized. Mirrors the
                 forward kernel but applies non-conjugated twiddle products
                 (opposite rotation) and folds the 1/fftLen scaling into the
                 last stage.
  @param[in]     S            CFFT instance (rearranged twiddle tables)
  @param[in,out] pSrc         complex buffer of 2*fftLen float16 values
  @param[in]     fftLen       transform length (power of 4 for this kernel)
  @param[in]     onebyfftLen  1/fftLen scale applied to every output
 */
static void _arm_radix4_butterfly_inverse_f16_mve(const arm_cfft_instance_f16 * S, float16_t *pSrc, uint32_t fftLen, float16_t onebyfftLen)
{
    f16x8_t vecTmp0, vecTmp1;
    f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f16x8_t vecA, vecB, vecC, vecD;
    f16x8_t vecW;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;             /* index into rearranged twiddle offset arrays */
    int32_t iter = 1;               /* butterfly groups in the current stage */
    /* Last-stage gather/scatter seed offsets; scaled by sizeof(float16_t *)
       for consistency with the forward kernel (was sizeof(q31_t *), same
       value: pointer size, 4 bytes on 32-bit M-profile targets). */
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(float16_t *),
        (4 - 16) * sizeof(float16_t *),
        (8 - 16) * sizeof(float16_t *),
        (12 - 16) * sizeof(float16_t *)
    };

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* All stages except the last (k = butterflies per group). */
    for (int k = fftLen / 4; k > 1; k >>= 2)
    {
        for (int i = 0; i < iter; i++)
        {
            /* Twiddle streams for the three non-trivial outputs. */
            float16_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                S->rearranged_twiddle_tab_stride1_arr[stage]];
            float16_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                S->rearranged_twiddle_tab_stride2_arr[stage]];
            float16_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                S->rearranged_twiddle_tab_stride3_arr[stage]];
            float16_t const *pW1, *pW2, *pW3;
            /* Four quarter-span legs of the i-th butterfly group. */
            float16_t *inA = pSrc + CMPLX_DIM * i * n1;
            float16_t *inB = inA + n2 * CMPLX_DIM;
            float16_t *inC = inB + n2 * CMPLX_DIM;
            float16_t *inD = inC + n2 * CMPLX_DIM;

            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 4;
            /*
             * load 4 f16 complex samples (8 lanes); A/C software pipelined
             */
            vecA = vldrhq_f16(inA);
            vecC = vldrhq_f16(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrhq_f16(inB);
                vecD = vldrhq_f16(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 8;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 8;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'  (inverse rotation)
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 8;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'  (inverse rotation)
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 8;

                /* pre-load A/C for the next iteration */
                vecA = vldrhq_f16(inA);
                vecC = vldrhq_f16(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;   /* 4x more groups in the next, shorter stage */
        stage++;
    }

    /*
     * start of Last stage process: unity twiddles, gather/scatter over 4
     * interleaved streams, with the 1/fftLen scale applied to each output
     */
    uint32x4_t vecScGathAddr = vld1q_u32(strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);

    blkCnt = (fftLen >> 4);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4);
        vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration (also advances base by 64 bytes) */
        vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);

        vecTmp0 = vecSum0 + vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
464
/**
  @brief         Inverse CFFT for lengths 2 * 4^n (32, 128, 512, 2048): one
                 radix-2 stage followed by two half-length inverse radix-4
                 sub-transforms carrying the 1/fftLen scale.
  @param[in]     S       CFFT instance (twiddle tables)
  @param[in,out] pSrc    complex buffer of 2*fftLen float16 values
  @param[in]     fftLen  transform length
 */
static void arm_cfft_radix4by2_inverse_f16_mve(const arm_cfft_instance_f16 * S, float16_t *pSrc, uint32_t fftLen)
{
    float16_t const *pTw = S->pTwiddle;                       /* twiddle stream */
    float16_t *pTop = pSrc;                                   /* first half */
    float16_t *pBot = pSrc + fftLen;                          /* second half */
    uint32_t halfLen = fftLen >> 1;
    float16_t scale = arm_inverse_fft_length_f16(fftLen);     /* 1/fftLen */
    f16x8_t vTop, vBot, vSum, vDiff, vRot, vTw;

    /* radix-2 stage: 4 complex f16 samples per pass */
    for (uint32_t cnt = halfLen / 4; cnt > 0U; cnt--)
    {
        vTop = *(f16x8_t *) pTop;
        vBot = *(f16x8_t *) pBot;
        vTw = vld1q(pTw);
        pTw += 8;

        vSum = vTop + vBot;     /* vaddq(vTop, vBot) */
        vDiff = vTop - vBot;    /* vsubq(vTop, vBot) */

        /* inverse direction: non-conjugated twiddle product */
        vRot = MVE_CMPLX_MULT_FLT_AxB(vTw, vDiff);

        vst1q(pTop, vSum);
        pTop += 8;
        vst1q(pBot, vRot);
        pBot += 8;
    }

    /* finish with two independent half-length inverse radix-4 transforms */
    _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, halfLen, scale);

    _arm_radix4_butterfly_inverse_f16_mve(S, pSrc + fftLen, halfLen, scale);
}
507
508
509 /**
510 @addtogroup ComplexFFT
511 @{
512 */
513
514 /**
515 @brief Processing function for the floating-point complex FFT.
516 @param[in] S points to an instance of the floating-point CFFT structure
517 @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
518 @param[in] ifftFlag flag that selects transform direction
519 - value = 0: forward transform
520 - value = 1: inverse transform
521 @param[in] bitReverseFlag flag that enables / disables bit reversal of output
522 - value = 0: disables bit reversal of output
523 - value = 1: enables bit reversal of output
524 @return none
525 */
526
527
void arm_cfft_f16(
    const arm_cfft_instance_f16 * S,
    float16_t * pSrc,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag)
{
    uint32_t fftLen = S->fftLen;

    if (ifftFlag == 1U) {

        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            /* pure power-of-4 lengths: inverse radix-4 kernel with 1/N scale */
            _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen));
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            /* 2 * 4^n lengths: radix-2 stage + two inverse radix-4 halves */
            arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);
            break;

        default:
            /* Unsupported length: instance was not set up via
               arm_cfft_init_f16; leave the buffer untouched. */
            break;
        }
    } else {
        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            /* pure power-of-4 lengths: forward radix-4 kernel */
            _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            /* 2 * 4^n lengths: radix-2 stage + two forward radix-4 halves */
            arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);
            break;

        default:
            /* Unsupported length: no-op (see arm_cfft_init_f16). */
            break;
        }
    }


    if (bitReverseFlag)
    {
        /* in-place bit reversal; each complex f16 pair is one 32-bit element */
        arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);

    }
}
581
582 #else
583
584 #if defined(ARM_FLOAT16_SUPPORTED)
585
586 extern void arm_bitreversal_16(
587 uint16_t * pSrc,
588 const uint16_t bitRevLen,
589 const uint16_t * pBitRevTable);
590
591
592 extern void arm_cfft_radix4by2_f16(
593 float16_t * pSrc,
594 uint32_t fftLen,
595 const float16_t * pCoef);
596
597 extern void arm_radix4_butterfly_f16(
598 float16_t * pSrc,
599 uint16_t fftLen,
600 const float16_t * pCoef,
601 uint16_t twidCoefModifier);
602
603 /**
604 @ingroup groupTransforms
605 */
606
607 /**
608 @defgroup ComplexFFT Complex FFT Functions
609
610 @par
611 The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
612 Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
613 than the DFT, especially for long lengths.
614 The algorithms described in this section
615 operate on complex data. A separate set of functions is devoted to handling
616 of real sequences.
617 @par
618 There are separate algorithms for handling floating-point, Q15, and Q31 data
619 types. The algorithms available for each data type are described next.
620 @par
621 The FFT functions operate in-place. That is, the array holding the input data
622 will also be used to hold the corresponding result. The input data is complex
623 and contains <code>2*fftLen</code> interleaved values as shown below.
624 <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
625 The FFT result will be contained in the same array and the frequency domain
626 values will have the same interleaving.
627
628 @par Floating-point
629 The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
630 stages are performed along with a single radix-2 or radix-4 stage, as needed.
631 The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
632 a different twiddle factor table.
633 @par
634 The function uses the standard FFT definition and output values may grow by a
635 factor of <code>fftLen</code> when computing the forward transform. The
636 inverse transform includes a scale of <code>1/fftLen</code> as part of the
637 calculation and this matches the textbook definition of the inverse FFT.
638 @par
639 For the MVE version, the new arm_cfft_init_f32 initialization function is
640 <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
641 needed FFTs.</b> Other FFT versions can continue to be initialized as
642 explained below.
643 @par
644 For not MVE versions, pre-initialized data structures containing twiddle factors
645 and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include
646 this header in your function and then pass one of the constant structures as
647 an argument to arm_cfft_f32. For example:
648 @par
649 <code>arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
650 @par
651 computes a 64-point inverse complex FFT including bit reversal.
652 The data structures are treated as constant data and not modified during the
653 calculation. The same data structure can be reused for multiple transforms
654 including mixing forward and inverse transforms.
655 @par
656 Earlier releases of the library provided separate radix-2 and radix-4
657 algorithms that operated on floating-point data. These functions are still
658 provided but are deprecated. The older functions are slower and less general
659 than the new functions.
660 @par
661 An example of initialization of the constants for the arm_cfft_f32 function follows:
662 @code
663 const static arm_cfft_instance_f32 *S;
664 ...
665 switch (length) {
666 case 16:
667 S = &arm_cfft_sR_f32_len16;
668 break;
669 case 32:
670 S = &arm_cfft_sR_f32_len32;
671 break;
672 case 64:
673 S = &arm_cfft_sR_f32_len64;
674 break;
675 case 128:
676 S = &arm_cfft_sR_f32_len128;
677 break;
678 case 256:
679 S = &arm_cfft_sR_f32_len256;
680 break;
681 case 512:
682 S = &arm_cfft_sR_f32_len512;
683 break;
684 case 1024:
685 S = &arm_cfft_sR_f32_len1024;
686 break;
687 case 2048:
688 S = &arm_cfft_sR_f32_len2048;
689 break;
690 case 4096:
691 S = &arm_cfft_sR_f32_len4096;
692 break;
693 }
694 @endcode
695 @par
696 The new arm_cfft_init_f32 can also be used.
697 @par Q15 and Q31
698 The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-4
699 stages are performed along with a single radix-2 stage, as needed.
700 The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
701 a different twiddle factor table.
702 @par
703 The function uses the standard FFT definition and output values may grow by a
704 factor of <code>fftLen</code> when computing the forward transform. The
705 inverse transform includes a scale of <code>1/fftLen</code> as part of the
706 calculation and this matches the textbook definition of the inverse FFT.
707 @par
708 Pre-initialized data structures containing twiddle factors and bit reversal
709 tables are provided and defined in <code>arm_const_structs.h</code>. Include
710 this header in your function and then pass one of the constant structures as
711 an argument to arm_cfft_q31. For example:
712 @par
713 <code>arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
714 @par
715 computes a 64-point inverse complex FFT including bit reversal.
716 The data structures are treated as constant data and not modified during the
717 calculation. The same data structure can be reused for multiple transforms
718 including mixing forward and inverse transforms.
719 @par
720 Earlier releases of the library provided separate radix-2 and radix-4
721 algorithms that operated on floating-point data. These functions are still
722 provided but are deprecated. The older functions are slower and less general
723 than the new functions.
724 @par
725 An example of initialization of the constants for the arm_cfft_q31 function follows:
726 @code
727 const static arm_cfft_instance_q31 *S;
728 ...
729 switch (length) {
730 case 16:
731 S = &arm_cfft_sR_q31_len16;
732 break;
733 case 32:
734 S = &arm_cfft_sR_q31_len32;
735 break;
736 case 64:
737 S = &arm_cfft_sR_q31_len64;
738 break;
739 case 128:
740 S = &arm_cfft_sR_q31_len128;
741 break;
742 case 256:
743 S = &arm_cfft_sR_q31_len256;
744 break;
745 case 512:
746 S = &arm_cfft_sR_q31_len512;
747 break;
748 case 1024:
749 S = &arm_cfft_sR_q31_len1024;
750 break;
751 case 2048:
752 S = &arm_cfft_sR_q31_len2048;
753 break;
754 case 4096:
755 S = &arm_cfft_sR_q31_len4096;
756 break;
757 }
758 @endcode
759
760 */
761
762
763 /**
764 @addtogroup ComplexFFT
765 @{
766 */
767
768 /**
769 @brief Processing function for the floating-point complex FFT.
770 @param[in] S points to an instance of the floating-point CFFT structure
771 @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
772 @param[in] ifftFlag flag that selects transform direction
773 - value = 0: forward transform
774 - value = 1: inverse transform
775 @param[in] bitReverseFlag flag that enables / disables bit reversal of output
776 - value = 0: disables bit reversal of output
777 - value = 1: enables bit reversal of output
778 @return none
779 */
780
void arm_cfft_f16(
    const arm_cfft_instance_f16 * S,
    float16_t * p1,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag)
{
    uint32_t L = S->fftLen;
    uint32_t idx;
    float16_t invL;

    /* Inverse transform is computed as conj(FFT(conj(x))) / L:
       conjugate the input here, run the forward kernels, then
       conjugate-and-scale the output below. */
    if (ifftFlag == 1U)
    {
        /* negate every imaginary part (odd interleaved index) */
        for (idx = 1U; idx < 2U * L; idx += 2U)
        {
            p1[idx] = -p1[idx];
        }
    }

    switch (L)
    {

    case 16:
    case 64:
    case 256:
    case 1024:
    case 4096:
        arm_radix4_butterfly_f16 (p1, L, (float16_t*)S->pTwiddle, 1U);
        break;

    case 32:
    case 128:
    case 512:
    case 2048:
        arm_cfft_radix4by2_f16 ( p1, L, (float16_t*)S->pTwiddle);
        break;

    }

    if ( bitReverseFlag )
        arm_bitreversal_16((uint16_t*)p1, S->bitRevLength,(uint16_t*)S->pBitRevTable);

    if (ifftFlag == 1U)
    {
        invL = 1.0f/(float16_t)L;
        /* conjugate and apply the 1/L inverse-FFT scale */
        for (idx = 0U; idx < 2U * L; idx += 2U)
        {
            p1[idx] = p1[idx] * invL;            /* real part */
            p1[idx + 1U] = -p1[idx + 1U] * invL; /* imaginary part */
        }
    }
}
837 #endif /* if defined(ARM_FLOAT16_SUPPORTED) */
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
839
840 /**
841 @} end of ComplexFFT group
842 */
843