/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cfft_q31.c
 * Description:  Combined Radix Decimation in Frequency CFFT fixed point processing function
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/transform_functions.h"


#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "arm_vec_fft.h"

static void _arm_radix4_butterfly_q31_mve(
    const arm_cfft_instance_q31 *S,
    q31_t *pSrc,
    uint32_t fftLen)
{
    q31x4_t vecTmp0, vecTmp1;
    q31x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    q31x4_t vecA, vecB, vecC, vecD;
    q31x4_t vecW;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;
    int32_t iter = 1;
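    /*
     * Byte offsets used by the last-stage gathers/scatters below (the base
     * address is advanced by 64 bytes per loop iteration): the lanes address
     * the q31 elements at indices {0, 1, 8, 9} of each block of 16, i.e. two
     * complex values per vector, so two radix-4 butterflies are processed
     * per vector operation.
     */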
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q31_t), (1 - 16) * sizeof(q31_t),
        (8 - 16) * sizeof(q31_t), (9 - 16) * sizeof(q31_t)
    };

    /*
     * Process first stages
     * Each middle stage scales the data down by a factor of 4 (two halvings)
     */
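    /*
     * Radix-4 DIF butterfly computed in the inner loop (A..D are the four
     * input groups, W1..W3 the rearranged twiddles; each output is
     * additionally halved twice for guard bits):
     *   A' = (A + C) + (B + D)
     *   B' = W2 * ((A + C) - (B + D))
     *   C' = W1 * ((A - C) - i(B - D))
     *   D' = W3 * ((A - C) + i(B - D))
     */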
    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;

    for (int k = fftLen / 4u; k > 1; k >>= 2u)
    {
        for (int i = 0; i < iter; i++)
        {
            q31_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                    S->rearranged_twiddle_tab_stride2_arr[stage]];
            q31_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                    S->rearranged_twiddle_tab_stride3_arr[stage]];
            q31_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                    S->rearranged_twiddle_tab_stride1_arr[stage]];
            q31_t const *pW1, *pW2, *pW3;
            q31_t *inA = pSrc + CMPLX_DIM * i * n1;
            q31_t *inB = inA + n2 * CMPLX_DIM;
            q31_t *inC = inB + n2 * CMPLX_DIM;
            q31_t *inD = inC + n2 * CMPLX_DIM;

            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 2;
            /*
             * load two q31 complex pairs
             */
            vecA = vldrwq_s32(inA);
            vecC = vldrwq_s32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_s32(inB);
                vecD = vldrwq_s32(inD);

                vecSum0  = vhaddq(vecA, vecC);
                vecDiff0 = vhsubq(vecA, vecC);

                vecSum1  = vhaddq(vecB, vecD);
                vecDiff1 = vhsubq(vecB, vecD);
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vhaddq(vecSum0, vecSum1);
                vst1q(inA, vecTmp0);
                inA += 4;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vhsubq(vecSum0, vecSum1);
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]' .* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);

                vst1q(inB, vecTmp1);
                inB += 4;
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]' .* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 4;
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]' .* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 4;

                vecA = vldrwq_s32(inA);
                vecC = vldrwq_s32(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * End of first stages process
     * data is in 11.21(q21) format for the 1024 point as there are 3 middle stages
     * data is in 9.23(q23) format for the 256 point as there are 2 middle stages
     * data is in 7.25(q25) format for the 64 point as there is 1 middle stage
     * data is in 5.27(q27) format for the 16 point as there are no middle stages
     */
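    /*
     * Worked example: every radix-4 stage (including the last stage below)
     * halves twice via vhaddq/vhsubq, i.e. scales by 1/4.  A 1024-point
     * transform needs log4(1024) = 5 radix-4 stages, so the total scaling is
     * 4^-5 = 2^-10 and 1.31 input becomes 11.21 output; likewise
     * 256 -> 2^-8 -> 9.23, 64 -> 2^-6 -> 7.25, 16 -> 2^-4 -> 5.27.
     */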

    /*
     * Start of last stage process
     */
    uint32x4_t vecScGathAddr = vld1q_u32(strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling: pre-load the first two vectors so each iteration's
     * loads overlap with the previous iteration's arithmetic
     */
    vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);

    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0  = vhaddq(vecA, vecC);
        vecDiff0 = vhsubq(vecA, vecC);

        vecB = vldrwq_gather_base_s32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_s32(vecScGathAddr, 24);

        vecSum1  = vhaddq(vecB, vecD);
        vecDiff1 = vhsubq(vecB, vecD);
        /*
         * pre-load for next iteration
         */
        vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);

        vecTmp0 = vhaddq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vhsubq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }

    /*
     * output is in 11.21(q21) format for the 1024 point
     * output is in 9.23(q23) format for the 256 point
     * output is in 7.25(q25) format for the 64 point
     * output is in 5.27(q27) format for the 16 point
     */
}


static void arm_cfft_radix4by2_q31_mve(const arm_cfft_instance_q31 *S, q31_t *pSrc, uint32_t fftLen)
{
    uint32_t n2;
    q31_t *pIn0;
    q31_t *pIn1;
    const q31_t *pCoef = S->pTwiddle;
    uint32_t blkCnt;
    q31x4_t vecIn0, vecIn1, vecSum, vecDiff;
    q31x4_t vecCmplxTmp, vecTw;
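
    /*
     * Radix-4-by-2 decomposition: a single radix-2 DIF stage splits the
     * length-fftLen transform into two independent transforms of length
     * fftLen/2, which are handled by the radix-4 kernel; a final left shift
     * compensates one of the extra halvings so that the overall scaling is
     * exactly 1/fftLen.
     */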

    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;

    blkCnt = n2 / 2;

    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn1 = vld1q_s32(pIn1);

        vecIn0 = vecIn0 >> 1;
        vecIn1 = vecIn1 >> 1;
        vecSum = vhaddq(vecIn0, vecIn1);
        vst1q(pIn0, vecSum);
        pIn0 += 4;

        vecTw = vld1q_s32(pCoef);
        pCoef += 4;
        vecDiff = vhsubq(vecIn0, vecIn1);

        vecCmplxTmp = MVE_CMPLX_MULT_FX_AxConjB(vecDiff, vecTw);
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        blkCnt--;
    }

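    /* two independent half-length radix-4 CFFTs on the lower and upper halves */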
    _arm_radix4_butterfly_q31_mve(S, pSrc, n2);

    _arm_radix4_butterfly_q31_mve(S, pSrc + fftLen, n2);

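    /* scale the whole buffer back up by 2 (see scaling note above) */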
    pIn0 = pSrc;
    blkCnt = (fftLen << 1) >> 2;
    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vst1q(pIn0, vecIn0);
        pIn0 += 4;
        blkCnt--;
    }
    /*
     * tail
     * (will be merged through tail predication)
     */
    blkCnt = (fftLen << 1) & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);

        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vstrwq_p(pIn0, vecIn0, p0);
    }
}

static void _arm_radix4_butterfly_inverse_q31_mve(
    const arm_cfft_instance_q31 *S,
    q31_t *pSrc,
    uint32_t fftLen)
{
    q31x4_t vecTmp0, vecTmp1;
    q31x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    q31x4_t vecA, vecB, vecC, vecD;
    q31x4_t vecW;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;
    int32_t iter = 1;
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q31_t), (1 - 16) * sizeof(q31_t),
        (8 - 16) * sizeof(q31_t), (9 - 16) * sizeof(q31_t)
    };

    /*
     * Process first stages
     * Each middle stage scales the data down by a factor of 4 (two halvings)
     */
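    /*
     * Same structure as the forward butterfly above; the twiddle multiplies
     * use conj(W) (AxConjB) and the +/-i branches are swapped, which is what
     * makes this the inverse transform.
     */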
    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;

    for (int k = fftLen / 4u; k > 1; k >>= 2u)
    {
        for (int i = 0; i < iter; i++)
        {
            q31_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                    S->rearranged_twiddle_tab_stride2_arr[stage]];
            q31_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                    S->rearranged_twiddle_tab_stride3_arr[stage]];
            q31_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                    S->rearranged_twiddle_tab_stride1_arr[stage]];

            q31_t const *pW1, *pW2, *pW3;
            q31_t *inA = pSrc + CMPLX_DIM * i * n1;
            q31_t *inB = inA + n2 * CMPLX_DIM;
            q31_t *inC = inB + n2 * CMPLX_DIM;
            q31_t *inD = inC + n2 * CMPLX_DIM;

            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 2;
            /*
             * load two q31 complex pairs
             */
            vecA = vldrwq_s32(inA);
            vecC = vldrwq_s32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_s32(inB);
                vecD = vldrwq_s32(inD);

                vecSum0  = vhaddq(vecA, vecC);
                vecDiff0 = vhsubq(vecA, vecC);

                vecSum1  = vhaddq(vecB, vecD);
                vecDiff1 = vhsubq(vecB, vecD);
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vhaddq(vecSum0, vecSum1);
                vst1q(inA, vecTmp0);
                inA += 4;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vhsubq(vecSum0, vecSum1);
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]' .* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);

                vst1q(inB, vecTmp1);
                inB += 4;
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]' .* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);
                vst1q(inC, vecTmp1);
                inC += 4;
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]' .* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);
                vst1q(inD, vecTmp1);
                inD += 4;

                vecA = vldrwq_s32(inA);
                vecC = vldrwq_s32(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * End of first stages process
     * data is in 11.21(q21) format for the 1024 point as there are 3 middle stages
     * data is in 9.23(q23) format for the 256 point as there are 2 middle stages
     * data is in 7.25(q25) format for the 64 point as there is 1 middle stage
     * data is in 5.27(q27) format for the 16 point as there are no middle stages
     */

    /*
     * Start of last stage process
     */
    uint32x4_t vecScGathAddr = vld1q_u32(strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling: pre-load the first two vectors so each iteration's
     * loads overlap with the previous iteration's arithmetic
     */
    vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);

    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0  = vhaddq(vecA, vecC);
        vecDiff0 = vhsubq(vecA, vecC);

        vecB = vldrwq_gather_base_s32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_s32(vecScGathAddr, 24);

        vecSum1  = vhaddq(vecB, vecD);
        vecDiff1 = vhsubq(vecB, vecD);
        /*
         * pre-load for next iteration
         */
        vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);

        vecTmp0 = vhaddq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vhsubq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }
    /*
     * output is in 11.21(q21) format for the 1024 point
     * output is in 9.23(q23) format for the 256 point
     * output is in 7.25(q25) format for the 64 point
     * output is in 5.27(q27) format for the 16 point
     */
}

static void arm_cfft_radix4by2_inverse_q31_mve(const arm_cfft_instance_q31 *S, q31_t *pSrc, uint32_t fftLen)
{
    uint32_t n2;
    q31_t *pIn0;
    q31_t *pIn1;
    const q31_t *pCoef = S->pTwiddle;
    uint32_t blkCnt;
    q31x4_t vecIn0, vecIn1, vecSum, vecDiff;
    q31x4_t vecCmplxTmp, vecTw;

    n2 = fftLen >> 1;

    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;
    blkCnt = n2 / 2;

    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn1 = vld1q_s32(pIn1);

        vecIn0 = vecIn0 >> 1;
        vecIn1 = vecIn1 >> 1;
        vecSum = vhaddq(vecIn0, vecIn1);
        vst1q(pIn0, vecSum);
        pIn0 += 4;

        vecTw = vld1q_s32(pCoef);
        pCoef += 4;
        vecDiff = vhsubq(vecIn0, vecIn1);

        vecCmplxTmp = MVE_CMPLX_MULT_FX_AxB(vecDiff, vecTw);
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        blkCnt--;
    }

    _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, n2);

    _arm_radix4_butterfly_inverse_q31_mve(S, pSrc + fftLen, n2);

    pIn0 = pSrc;
    blkCnt = (fftLen << 1) >> 2;
    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vst1q(pIn0, vecIn0);
        pIn0 += 4;
        blkCnt--;
    }
    /*
     * tail
     * (will be merged through tail predication)
     */
    blkCnt = (fftLen << 1) & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);

        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vstrwq_p(pIn0, vecIn0, p0);
    }
}

/**
  @ingroup groupTransforms
 */

/**
  @addtogroup ComplexFFT
  @{
 */

/**
  @brief         Processing function for the Q31 complex FFT.
  @param[in]     S               points to an instance of the fixed-point CFFT structure
  @param[in,out] pSrc            points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  @param[in]     ifftFlag        flag that selects transform direction
                   - value = 0: forward transform
                   - value = 1: inverse transform
  @param[in]     bitReverseFlag  flag that enables / disables bit reversal of output
                   - value = 0: disables bit reversal of output
                   - value = 1: enables bit reversal of output
  @return        none
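  @par           Example
                   A minimal usage sketch. It assumes the instance is set up
                   with arm_cfft_init_q31() from the CMSIS-DSP API; the buffer
                   name and FFT length are illustrative only.
  @code
      #define FFT_LEN 256
      q31_t buf[2 * FFT_LEN];             // interleaved {real, imag} q31 samples
      arm_cfft_instance_q31 inst;

      if (arm_cfft_init_q31(&inst, FFT_LEN) == ARM_MATH_SUCCESS)
      {
          arm_cfft_q31(&inst, buf, 0, 1); // forward CFFT, natural-order output
          // buf now holds the spectrum in 9.23 format (scaled by 1/FFT_LEN)
      }
  @endcode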
 */
void arm_cfft_q31(
    const arm_cfft_instance_q31 * S,
    q31_t * pSrc,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag)
{
    uint32_t fftLen = S->fftLen;

    if (ifftFlag == 1U) {

        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen);
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen);
            break;
        }
    } else {
        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen);
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen);
            break;
        }
    }

    if (bitReverseFlag)
    {
        arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
    }
}

/**
  @} end of ComplexFFT group
 */

#else

extern void arm_radix4_butterfly_q31(
        q31_t * pSrc,
        uint32_t fftLen,
        const q31_t * pCoef,
        uint32_t twidCoefModifier);

extern void arm_radix4_butterfly_inverse_q31(
        q31_t * pSrc,
        uint32_t fftLen,
        const q31_t * pCoef,
        uint32_t twidCoefModifier);

extern void arm_bitreversal_32(
        uint32_t * pSrc,
        const uint16_t bitRevLen,
        const uint16_t * pBitRevTable);

void arm_cfft_radix4by2_q31(
        q31_t * pSrc,
        uint32_t fftLen,
        const q31_t * pCoef);

void arm_cfft_radix4by2_inverse_q31(
        q31_t * pSrc,
        uint32_t fftLen,
        const q31_t * pCoef);


/**
  @ingroup groupTransforms
 */

/**
  @addtogroup ComplexFFT
  @{
 */

/**
  @brief         Processing function for the Q31 complex FFT.
  @param[in]     S               points to an instance of the fixed-point CFFT structure
  @param[in,out] p1              points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  @param[in]     ifftFlag        flag that selects transform direction
                   - value = 0: forward transform
                   - value = 1: inverse transform
  @param[in]     bitReverseFlag  flag that enables / disables bit reversal of output
                   - value = 0: disables bit reversal of output
                   - value = 1: enables bit reversal of output
  @return        none
 */
void arm_cfft_q31(
    const arm_cfft_instance_q31 * S,
    q31_t * p1,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag)
{
    uint32_t L = S->fftLen;

    if (ifftFlag == 1U)
    {
        switch (L)
        {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            arm_radix4_butterfly_inverse_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_inverse_q31 ( p1, L, S->pTwiddle );
            break;
        }
    }
    else
    {
        switch (L)
        {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            arm_radix4_butterfly_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_q31 ( p1, L, S->pTwiddle );
            break;
        }
    }

    if ( bitReverseFlag )
        arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);
}

/**
  @} end of ComplexFFT group
 */

void arm_cfft_radix4by2_q31(
        q31_t * pSrc,
        uint32_t fftLen,
        const q31_t * pCoef)
{
    uint32_t i, l;
    uint32_t n2;
    q31_t xt, yt, cosVal, sinVal;
    q31_t p0, p1;

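    /*
     * radix-2 DIF stage: butterfly between samples i and i + fftLen/2.
     * Inputs are pre-scaled by 1/4 for guard bits; the difference is
     * multiplied by conj(W) = cos - i*sin, and the << 1 below compensates
     * the halving inherent in the fractional multiply.
     */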
    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        cosVal = pCoef[2 * i];
        sinVal = pCoef[2 * i + 1];

        l = i + n2;

        xt          = (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U);
        pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U);

        yt              = (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U);
        pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U);

        mult_32x32_keep32_R(p0, xt, cosVal);
        mult_32x32_keep32_R(p1, yt, cosVal);
        multAcc_32x32_keep32_R(p0, yt, sinVal);
        multSub_32x32_keep32_R(p1, xt, sinVal);

        pSrc[2 * l]     = p0 << 1;
        pSrc[2 * l + 1] = p1 << 1;
    }

    /* first col */
    arm_radix4_butterfly_q31 (pSrc, n2, (q31_t*)pCoef, 2U);

    /* second col */
    arm_radix4_butterfly_q31 (pSrc + fftLen, n2, (q31_t*)pCoef, 2U);

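    /*
     * scale the combined output back up by 2 so the overall scaling is
     * exactly 1/fftLen, matching the pure radix-4 lengths
     */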
    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        p0 = pSrc[4 * i + 0];
        p1 = pSrc[4 * i + 1];
        xt = pSrc[4 * i + 2];
        yt = pSrc[4 * i + 3];

        p0 <<= 1U;
        p1 <<= 1U;
        xt <<= 1U;
        yt <<= 1U;

        pSrc[4 * i + 0] = p0;
        pSrc[4 * i + 1] = p1;
        pSrc[4 * i + 2] = xt;
        pSrc[4 * i + 3] = yt;
    }
}

void arm_cfft_radix4by2_inverse_q31(
        q31_t * pSrc,
        uint32_t fftLen,
        const q31_t * pCoef)
{
    uint32_t i, l;
    uint32_t n2;
    q31_t xt, yt, cosVal, sinVal;
    q31_t p0, p1;

    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        cosVal = pCoef[2 * i];
        sinVal = pCoef[2 * i + 1];

        l = i + n2;

        xt          = (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U);
        pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U);

        yt              = (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U);
        pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U);

        mult_32x32_keep32_R(p0, xt, cosVal);
        mult_32x32_keep32_R(p1, yt, cosVal);
        multSub_32x32_keep32_R(p0, yt, sinVal);
        multAcc_32x32_keep32_R(p1, xt, sinVal);

        pSrc[2 * l]     = p0 << 1U;
        pSrc[2 * l + 1] = p1 << 1U;
    }

    /* first col */
    arm_radix4_butterfly_inverse_q31( pSrc, n2, (q31_t*)pCoef, 2U);

    /* second col */
    arm_radix4_butterfly_inverse_q31( pSrc + fftLen, n2, (q31_t*)pCoef, 2U);

    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        p0 = pSrc[4 * i + 0];
        p1 = pSrc[4 * i + 1];
        xt = pSrc[4 * i + 2];
        yt = pSrc[4 * i + 3];

        p0 <<= 1U;
        p1 <<= 1U;
        xt <<= 1U;
        yt <<= 1U;

        pSrc[4 * i + 0] = p0;
        pSrc[4 * i + 1] = p1;
        pSrc[4 * i + 2] = xt;
        pSrc[4 * i + 3] = yt;
    }
}
#endif /* defined(ARM_MATH_MVEI) */