/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cfft_f16.c
 * Description:  Combined Radix Decimation in Frequency CFFT floating-point processing function
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/transform_functions_f16.h"
#include "arm_common_tables_f16.h"


#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "arm_helium_utils.h"
#include "arm_vec_fft.h"
#include "arm_mve_tables_f16.h"


static float16_t arm_inverse_fft_length_f16(uint16_t fftLen)
{
    float16_t retValue = 1.0f16;

    switch (fftLen)
    {
    case 4096U:
        retValue = (float16_t)0.000244140625f;
        break;

    case 2048U:
        retValue = (float16_t)0.00048828125f;
        break;

    case 1024U:
        retValue = (float16_t)0.0009765625f;
        break;

    case 512U:
        retValue = (float16_t)0.001953125f;
        break;

    case 256U:
        retValue = (float16_t)0.00390625f;
        break;

    case 128U:
        retValue = (float16_t)0.0078125f;
        break;

    case 64U:
        retValue = (float16_t)0.015625f;
        break;

    case 32U:
        retValue = (float16_t)0.03125f;
        break;

    case 16U:
        retValue = (float16_t)0.0625f;
        break;

    default:
        break;
    }
    return (retValue);
}
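
/*
 * Note: for the supported power-of-two lengths the table above simply
 * returns 1/fftLen; the lookup avoids a run-time half-precision divide.
 * An equivalent (but slower) sketch would be:
 *     retValue = (float16_t)(1.0f / (float32_t)fftLen);
 */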


static void _arm_radix4_butterfly_f16_mve(const arm_cfft_instance_f16 * S, float16_t * pSrc, uint32_t fftLen)
{
    f16x8_t vecTmp0, vecTmp1;
    f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f16x8_t vecA, vecB, vecC, vecD;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;
    int32_t iter = 1;
    static const int32_t strides[4] =
      { ( 0 - 16) * (int32_t)sizeof(float16_t *)
      , ( 4 - 16) * (int32_t)sizeof(float16_t *)
      , ( 8 - 16) * (int32_t)sizeof(float16_t *)
      , (12 - 16) * (int32_t)sizeof(float16_t *)};

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    for (int k = fftLen / 4u; k > 1; k >>= 2)
    {
        float16_t const *p_rearranged_twiddle_tab_stride1 =
            &S->rearranged_twiddle_stride1[
            S->rearranged_twiddle_tab_stride1_arr[stage]];
        float16_t const *p_rearranged_twiddle_tab_stride2 =
            &S->rearranged_twiddle_stride2[
            S->rearranged_twiddle_tab_stride2_arr[stage]];
        float16_t const *p_rearranged_twiddle_tab_stride3 =
            &S->rearranged_twiddle_stride3[
            S->rearranged_twiddle_tab_stride3_arr[stage]];
        float16_t * pBase = pSrc;
        for (int i = 0; i < iter; i++)
        {
            float16_t *inA = pBase;
            float16_t *inB = inA + n2 * CMPLX_DIM;
            float16_t *inC = inB + n2 * CMPLX_DIM;
            float16_t *inD = inC + n2 * CMPLX_DIM;
            float16_t const *pW1 = p_rearranged_twiddle_tab_stride1;
            float16_t const *pW2 = p_rearranged_twiddle_tab_stride2;
            float16_t const *pW3 = p_rearranged_twiddle_tab_stride3;
            f16x8_t vecW;

            blkCnt = n2 / 4;
            /*
             * load 4 f16 complex pairs
             */
            vecA = vldrhq_f16(inA);
            vecC = vldrhq_f16(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrhq_f16(inB);
                vecD = vldrhq_f16(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 8;

                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]' .* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 8;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]' .* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 8;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]' .* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 8;

                vecA = vldrhq_f16(inA);
                vecC = vldrhq_f16(inC);

                blkCnt--;
            }
            pBase += CMPLX_DIM * n1;
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * Start of last stage process
     */
    uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
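
    /*
     * Addressing sketch for the final stage: each 32-bit gather/scatter
     * lane moves one interleaved f16 complex pair. The strides start the
     * four lanes 16, 12, 8 and 4 complex samples behind pSrc, and each
     * write-back gather advances the base by 64 bytes (16 complex
     * samples). After the first write-back, the lanes of vecA hold the
     * first inputs of four consecutive 4-point butterflies (complex
     * indices 0, 4, 8, 12); byte offsets +4, +8 and +12 then select the
     * second, third and fourth inputs. The final stage needs no twiddle
     * multiplies.
     */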

    /* load scheduling */
    vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);

    blkCnt = (fftLen >> 4);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4);
        vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration */
        vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);

        vecTmp0 = vecSum0 + vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}

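/*
 * Lengths of the form 2 * 4^n are handled by one radix-2 DIF stage (the
 * loop below) followed by two independent half-length radix-4 transforms.
 */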
static void arm_cfft_radix4by2_f16_mve(const arm_cfft_instance_f16 * S, float16_t *pSrc, uint32_t fftLen)
{
    float16_t const *pCoefVec;
    float16_t const *pCoef = S->pTwiddle;
    float16_t *pIn0, *pIn1;
    uint32_t n2;
    uint32_t blkCnt;
    f16x8_t vecIn0, vecIn1, vecSum, vecDiff;
    f16x8_t vecCmplxTmp, vecTw;


    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;
    pCoefVec = pCoef;

    blkCnt = n2 / 4;
    while (blkCnt > 0U)
    {
        vecIn0 = *(f16x8_t *) pIn0;
        vecIn1 = *(f16x8_t *) pIn1;
        vecTw = vld1q(pCoefVec);
        pCoefVec += 8;

        vecSum = vaddq(vecIn0, vecIn1);
        vecDiff = vsubq(vecIn0, vecIn1);

        vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);

        vst1q(pIn0, vecSum);
        pIn0 += 8;
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 8;

        blkCnt--;
    }

    _arm_radix4_butterfly_f16_mve(S, pSrc, n2);

    _arm_radix4_butterfly_f16_mve(S, pSrc + fftLen, n2);
}

static void _arm_radix4_butterfly_inverse_f16_mve(const arm_cfft_instance_f16 * S, float16_t * pSrc, uint32_t fftLen, float16_t onebyfftLen)
{
    f16x8_t vecTmp0, vecTmp1;
    f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f16x8_t vecA, vecB, vecC, vecD;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;
    int32_t iter = 1;
    static const int32_t strides[4] = {
        ( 0 - 16) * (int32_t)sizeof(q31_t *),
        ( 4 - 16) * (int32_t)sizeof(q31_t *),
        ( 8 - 16) * (int32_t)sizeof(q31_t *),
        (12 - 16) * (int32_t)sizeof(q31_t *)
    };

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    for (int k = fftLen / 4; k > 1; k >>= 2)
    {
        float16_t const *p_rearranged_twiddle_tab_stride1 =
            &S->rearranged_twiddle_stride1[
            S->rearranged_twiddle_tab_stride1_arr[stage]];
        float16_t const *p_rearranged_twiddle_tab_stride2 =
            &S->rearranged_twiddle_stride2[
            S->rearranged_twiddle_tab_stride2_arr[stage]];
        float16_t const *p_rearranged_twiddle_tab_stride3 =
            &S->rearranged_twiddle_stride3[
            S->rearranged_twiddle_tab_stride3_arr[stage]];

        float16_t * pBase = pSrc;
        for (int i = 0; i < iter; i++)
        {
            float16_t *inA = pBase;
            float16_t *inB = inA + n2 * CMPLX_DIM;
            float16_t *inC = inB + n2 * CMPLX_DIM;
            float16_t *inD = inC + n2 * CMPLX_DIM;
            float16_t const *pW1 = p_rearranged_twiddle_tab_stride1;
            float16_t const *pW2 = p_rearranged_twiddle_tab_stride2;
            float16_t const *pW3 = p_rearranged_twiddle_tab_stride3;
            f16x8_t vecW;

            blkCnt = n2 / 4;
            /*
             * load 4 f16 complex pairs
             */
            vecA = vldrhq_f16(inA);
            vecC = vldrhq_f16(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrhq_f16(inB);
                vecD = vldrhq_f16(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 8;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]' .* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 8;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]' .* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 8;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]' .* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 8;

                vecA = vldrhq_f16(inA);
                vecC = vldrhq_f16(inC);

                blkCnt--;
            }
            pBase += CMPLX_DIM * n1;
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * Start of last stage process
     */
    uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);

    blkCnt = (fftLen >> 4);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4);
        vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration */
        vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);

        vecTmp0 = vecSum0 + vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}

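/*
 * Inverse counterpart of arm_cfft_radix4by2_f16_mve: the same radix-2
 * split, but using non-conjugated twiddles, with the 1/fftLen scaling
 * applied inside the two half-length inverse radix-4 transforms.
 */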
static void arm_cfft_radix4by2_inverse_f16_mve(const arm_cfft_instance_f16 * S, float16_t *pSrc, uint32_t fftLen)
{
    float16_t const *pCoefVec;
    float16_t const *pCoef = S->pTwiddle;
    float16_t *pIn0, *pIn1;
    uint32_t n2;
    float16_t onebyfftLen = arm_inverse_fft_length_f16(fftLen);
    uint32_t blkCnt;
    f16x8_t vecIn0, vecIn1, vecSum, vecDiff;
    f16x8_t vecCmplxTmp, vecTw;


    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;
    pCoefVec = pCoef;

    blkCnt = n2 / 4;
    while (blkCnt > 0U)
    {
        vecIn0 = *(f16x8_t *) pIn0;
        vecIn1 = *(f16x8_t *) pIn1;
        vecTw = vld1q(pCoefVec);
        pCoefVec += 8;

        vecSum = vaddq(vecIn0, vecIn1);
        vecDiff = vsubq(vecIn0, vecIn1);

        vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff);

        vst1q(pIn0, vecSum);
        pIn0 += 8;
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 8;

        blkCnt--;
    }

    _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, n2, onebyfftLen);

    _arm_radix4_butterfly_inverse_f16_mve(S, pSrc + fftLen, n2, onebyfftLen);
}


/**
  @addtogroup ComplexFFTF16
  @{
 */

/**
  @brief         Processing function for the floating-point complex FFT.
  @param[in]     S              points to an instance of the floating-point CFFT structure
  @param[in,out] pSrc           points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  @param[in]     ifftFlag       flag that selects transform direction
                   - value = 0: forward transform
                   - value = 1: inverse transform
  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
                   - value = 0: disables bit reversal of output
                   - value = 1: enables bit reversal of output
 */
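
/*
 * Minimal usage sketch (illustrative, not part of the library): the
 * instance is assumed to be set up with arm_cfft_init_f16() and the
 * buffer holds fftLen interleaved real/imaginary f16 pairs.
 *
 *     arm_cfft_instance_f16 S;
 *     float16_t buf[2 * 256];               // 256 complex samples
 *     // ... fill buf with interleaved re/im data ...
 *     if (arm_cfft_init_f16(&S, 256) == ARM_MATH_SUCCESS)
 *     {
 *         arm_cfft_f16(&S, buf, 0, 1);      // forward FFT, bit-reversed
 *     }                                     // output reordered in-place
 */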


ARM_DSP_ATTRIBUTE void arm_cfft_f16(
    const arm_cfft_instance_f16 * S,
    float16_t * pSrc,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag)
{
    uint32_t fftLen = S->fftLen;

    if (ifftFlag == 1U) {

        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen));
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);
            break;
        }
    } else {
        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);
            break;
        }
    }

    if (bitReverseFlag)
    {
        arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
    }
}

#else

#if defined(ARM_FLOAT16_SUPPORTED)

extern void arm_bitreversal_16(
        uint16_t * pSrc,
  const uint16_t bitRevLen,
  const uint16_t * pBitRevTable);


extern void arm_cfft_radix4by2_f16(
        float16_t * pSrc,
        uint32_t fftLen,
  const float16_t * pCoef);

extern void arm_radix4_butterfly_f16(
        float16_t * pSrc,
        uint16_t fftLen,
  const float16_t * pCoef,
        uint16_t twidCoefModifier);

/**
  @addtogroup ComplexFFTF16
  @{
 */

/**
  @brief         Processing function for the floating-point complex FFT.
  @param[in]     S              points to an instance of the floating-point CFFT structure
  @param[in,out] p1             points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  @param[in]     ifftFlag       flag that selects transform direction
                   - value = 0: forward transform
                   - value = 1: inverse transform
  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
                   - value = 0: disables bit reversal of output
                   - value = 1: enables bit reversal of output
 */

ARM_DSP_ATTRIBUTE void arm_cfft_f16(
    const arm_cfft_instance_f16 * S,
    float16_t * p1,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag)
{
    uint32_t L = S->fftLen, l;
    float16_t invL, * pSrc;

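    /*
     * The inverse transform reuses the forward kernel via the identity
     *   ifft(x) = conj(fft(conj(x))) / N :
     * conjugate the input here, run the forward FFT, then conjugate and
     * scale the output below.
     */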
    if (ifftFlag == 1U)
    {
        /* Conjugate input data */
        pSrc = p1 + 1;
        for (l = 0; l < L; l++)
        {
            *pSrc = -(_Float16)*pSrc;
            pSrc += 2;
        }
    }

    switch (L)
    {
    case 16:
    case 64:
    case 256:
    case 1024:
    case 4096:
        arm_radix4_butterfly_f16(p1, L, (float16_t*)S->pTwiddle, 1U);
        break;

    case 32:
    case 128:
    case 512:
    case 2048:
        arm_cfft_radix4by2_f16(p1, L, (float16_t*)S->pTwiddle);
        break;
    }

    if (bitReverseFlag)
        arm_bitreversal_16((uint16_t*)p1, S->bitRevLength, (uint16_t*)S->pBitRevTable);

    if (ifftFlag == 1U)
    {
        invL = 1.0f16 / (_Float16)L;
        /* Conjugate and scale output data */
        pSrc = p1;
        for (l = 0; l < L; l++)
        {
            *pSrc++ *= (_Float16)invL;
            *pSrc = -(_Float16)(*pSrc) * (_Float16)invL;
            pSrc++;
        }
    }
}
#endif /* if defined(ARM_FLOAT16_SUPPORTED) */
#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */

/**
  @} end of ComplexFFTF16 group
 */