/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cfft_q31.c
 * Description:  Combined Radix Decimation in Frequency CFFT fixed point processing function
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/transform_functions.h"



#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "arm_vec_fft.h"

static void _arm_radix4_butterfly_q31_mve(
    const arm_cfft_instance_q31 * S,
    q31_t *pSrc,
    uint32_t fftLen)
{
    q31x4_t vecTmp0, vecTmp1;
    q31x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    q31x4_t vecA, vecB, vecC, vecD;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;
    int32_t iter = 1;
    static const int32_t strides[4] = {
        (0 - 16) * (int32_t)sizeof(q31_t *), (1 - 16) * (int32_t)sizeof(q31_t *),
        (8 - 16) * (int32_t)sizeof(q31_t *), (9 - 16) * (int32_t)sizeof(q31_t *)
    };
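    /*
     * Note: sizeof(q31_t *) is 4 on these 32-bit targets, so the table reads
     * { -64, -60, -32, -28 } bytes. These are offsets from a write-back
     * gather base, used by the last-stage gather/scatter loads below.
     */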


    /*
     * Process first stages
     * Each middle stage scales the input down by a factor of 4
     * (the halving adds/subtracts shift out 2 bits per stage)
     */
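    /*
     * Per-stage radix-4 DIF butterfly (rows match the in-loop comments
     * below; W1, W2, W3 are the rearranged twiddle factors, and each row
     * also carries the 1/4 scaling from the halving adds/subtracts):
     *
     *   A' = (A + B + C + D)
     *   B' = (A - B + C - D) * W2
     *   C' = (A - iB - C + iD) * W1
     *   D' = (A + iB - C - iD) * W3
     */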
    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;

    for (int k = fftLen / 4u; k > 1; k >>= 2u)
    {
        q31_t const *p_rearranged_twiddle_tab_stride2 =
            &S->rearranged_twiddle_stride2[
            S->rearranged_twiddle_tab_stride2_arr[stage]];
        q31_t const *p_rearranged_twiddle_tab_stride3 =
            &S->rearranged_twiddle_stride3[
            S->rearranged_twiddle_tab_stride3_arr[stage]];
        q31_t const *p_rearranged_twiddle_tab_stride1 =
            &S->rearranged_twiddle_stride1[
            S->rearranged_twiddle_tab_stride1_arr[stage]];

        q31_t * pBase = pSrc;
        for (int i = 0; i < iter; i++)
        {
            q31_t *inA = pBase;
            q31_t *inB = inA + n2 * CMPLX_DIM;
            q31_t *inC = inB + n2 * CMPLX_DIM;
            q31_t *inD = inC + n2 * CMPLX_DIM;
            q31_t const *pW1 = p_rearranged_twiddle_tab_stride1;
            q31_t const *pW2 = p_rearranged_twiddle_tab_stride2;
            q31_t const *pW3 = p_rearranged_twiddle_tab_stride3;
            q31x4_t vecW;

            blkCnt = n2 / 2;
            /*
             * load 2 x q31 complex pairs
             */
            vecA = vldrwq_s32(inA);
            vecC = vldrwq_s32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_s32(inB);
                vecD = vldrwq_s32(inD);

                vecSum0 = vhaddq(vecA, vecC);
                vecDiff0 = vhsubq(vecA, vecC);

                vecSum1 = vhaddq(vecB, vecD);
                vecDiff1 = vhsubq(vecB, vecD);
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vhaddq(vecSum0, vecSum1);
                vst1q(inA, vecTmp0);
                inA += 4;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vhsubq(vecSum0, vecSum1);
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]' .* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q31x4_t);

                vst1q(inB, vecTmp1);
                inB += 4;
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]' .* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q31x4_t);
                vst1q(inC, vecTmp1);
                inC += 4;
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]' .* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q31x4_t);
                vst1q(inD, vecTmp1);
                inD += 4;

                vecA = vldrwq_s32(inA);
                vecC = vldrwq_s32(inC);

                blkCnt--;
            }
            pBase += CMPLX_DIM * n1;
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * End of first-stage processing
     * data is in 11.21(q21) format for a 1024-point FFT, as there are 3 middle stages
     * data is in 9.23(q23) format for a 256-point FFT, as there are 2 middle stages
     * data is in 7.25(q25) format for a 64-point FFT, as there is 1 middle stage
     * data is in 5.27(q27) format for a 16-point FFT, as there are no middle stages
     */
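    /*
     * Why these formats (arithmetic sketch): each radix-4 stage divides the
     * data by 4 (2 bits), and log4(fftLen) stages are applied in total, so
     * the net scaling is 1/fftLen. E.g. fftLen = 256: 4 stages, 8 bits of
     * shift, 1.31 input -> 9.23 output.
     */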

    /*
     * Start of last-stage processing
     */
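    /*
     * Last-stage addressing (sketch): with 4-byte pointers the strides table
     * is { -64, -60, -32, -28 } bytes from the write-back base. After the
     * first +64 write-back the gather hits pSrc + { 0, 4, 32, 36 } bytes,
     * i.e. complex samples 0 and 4; the +8 / +16 / +24 offsets then pick
     * samples {1,5}, {2,6} and {3,7}. Each iteration thus computes two
     * unit-stride, twiddle-free radix-4 butterflies over 8 consecutive
     * complex samples, then advances by 64 bytes.
     */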
    uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);

    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vhaddq(vecA, vecC);
        vecDiff0 = vhsubq(vecA, vecC);

        vecB = vldrwq_gather_base_s32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_s32(vecScGathAddr, 24);

        vecSum1 = vhaddq(vecB, vecD);
        vecDiff1 = vhsubq(vecB, vecD);
        /*
         * pre-load for next iteration
         */
        vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);

        vecTmp0 = vhaddq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vhsubq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }

    /*
     * output is in 11.21(q21) format for a 1024-point FFT
     * output is in 9.23(q23) format for a 256-point FFT
     * output is in 7.25(q25) format for a 64-point FFT
     * output is in 5.27(q27) format for a 16-point FFT
     */
}


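/*
 * Radix-4-by-2 decomposition for lengths of the form 2 * 4^k (32, 128, 512,
 * 2048): one radix-2 decimation-in-frequency stage splits the input into two
 * half-length sequences (the second rotated by conjugate twiddles), each of
 * which is processed by the radix-4 kernel above; a final pass rescales the
 * result by 2.
 */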
static void arm_cfft_radix4by2_q31_mve(const arm_cfft_instance_q31 *S, q31_t *pSrc, uint32_t fftLen)
{
    uint32_t n2;
    q31_t *pIn0;
    q31_t *pIn1;
    const q31_t *pCoef = S->pTwiddle;
    uint32_t blkCnt;
    q31x4_t vecIn0, vecIn1, vecSum, vecDiff;
    q31x4_t vecCmplxTmp, vecTw;

    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;

    blkCnt = n2 / 2;

    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn1 = vld1q_s32(pIn1);

        vecIn0 = vecIn0 >> 1;
        vecIn1 = vecIn1 >> 1;
        vecSum = vhaddq(vecIn0, vecIn1);
        vst1q(pIn0, vecSum);
        pIn0 += 4;

        vecTw = vld1q_s32(pCoef);
        pCoef += 4;
        vecDiff = vhsubq(vecIn0, vecIn1);

        vecCmplxTmp = MVE_CMPLX_MULT_FX_AxConjB(vecDiff, vecTw, q31x4_t);
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        blkCnt--;
    }

    _arm_radix4_butterfly_q31_mve(S, pSrc, n2);

    _arm_radix4_butterfly_q31_mve(S, pSrc + fftLen, n2);

    pIn0 = pSrc;
    blkCnt = (fftLen << 1) >> 2;
    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vst1q(pIn0, vecIn0);
        pIn0 += 4;
        blkCnt--;
    }
    /*
     * tail
     * (residual elements handled through tail predication)
     */
    blkCnt = (fftLen << 1) & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);

        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vstrwq_p(pIn0, vecIn0, p0);
    }
}

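/*
 * Inverse counterpart of _arm_radix4_butterfly_q31_mve: same flow, but the
 * twiddle multiplies use the conjugate (AxConjB) and the +/-i butterfly rows
 * are swapped, turning the e^(-j*2*pi*k/N) rotations into e^(+j*2*pi*k/N).
 */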
static void _arm_radix4_butterfly_inverse_q31_mve(
    const arm_cfft_instance_q31 *S,
    q31_t *pSrc,
    uint32_t fftLen)
{
    q31x4_t vecTmp0, vecTmp1;
    q31x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    q31x4_t vecA, vecB, vecC, vecD;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;
    int32_t iter = 1;
    static const int32_t strides[4] = {
        (0 - 16) * (int32_t)sizeof(q31_t *), (1 - 16) * (int32_t)sizeof(q31_t *),
        (8 - 16) * (int32_t)sizeof(q31_t *), (9 - 16) * (int32_t)sizeof(q31_t *)
    };

    /*
     * Process first stages
     * Each middle stage scales the input down by a factor of 4
     * (the halving adds/subtracts shift out 2 bits per stage)
     */
    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;

    for (int k = fftLen / 4u; k > 1; k >>= 2u)
    {
        q31_t const *p_rearranged_twiddle_tab_stride2 =
            &S->rearranged_twiddle_stride2[
            S->rearranged_twiddle_tab_stride2_arr[stage]];
        q31_t const *p_rearranged_twiddle_tab_stride3 =
            &S->rearranged_twiddle_stride3[
            S->rearranged_twiddle_tab_stride3_arr[stage]];
        q31_t const *p_rearranged_twiddle_tab_stride1 =
            &S->rearranged_twiddle_stride1[
            S->rearranged_twiddle_tab_stride1_arr[stage]];

        q31_t * pBase = pSrc;
        for (int i = 0; i < iter; i++)
        {
            q31_t *inA = pBase;
            q31_t *inB = inA + n2 * CMPLX_DIM;
            q31_t *inC = inB + n2 * CMPLX_DIM;
            q31_t *inD = inC + n2 * CMPLX_DIM;
            q31_t const *pW1 = p_rearranged_twiddle_tab_stride1;
            q31_t const *pW2 = p_rearranged_twiddle_tab_stride2;
            q31_t const *pW3 = p_rearranged_twiddle_tab_stride3;
            q31x4_t vecW;

            blkCnt = n2 / 2;
            /*
             * load 2 x q31 complex pairs
             */
            vecA = vldrwq_s32(inA);
            vecC = vldrwq_s32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_s32(inB);
                vecD = vldrwq_s32(inD);

                vecSum0 = vhaddq(vecA, vecC);
                vecDiff0 = vhsubq(vecA, vecC);

                vecSum1 = vhaddq(vecB, vecD);
                vecDiff1 = vhsubq(vecB, vecD);
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vhaddq(vecSum0, vecSum1);
                vst1q(inA, vecTmp0);
                inA += 4;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vhsubq(vecSum0, vecSum1);
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]' .* conj(W2)
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q31x4_t);

                vst1q(inB, vecTmp1);
                inB += 4;
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]' .* conj(W1)
                 */
                vecW = vld1q(pW1);
                pW1 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q31x4_t);
                vst1q(inC, vecTmp1);
                inC += 4;
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]' .* conj(W3)
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q31x4_t);
                vst1q(inD, vecTmp1);
                inD += 4;

                vecA = vldrwq_s32(inA);
                vecC = vldrwq_s32(inC);

                blkCnt--;
            }
            pBase += CMPLX_DIM * n1;
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * End of first-stage processing
     * data is in 11.21(q21) format for a 1024-point FFT, as there are 3 middle stages
     * data is in 9.23(q23) format for a 256-point FFT, as there are 2 middle stages
     * data is in 7.25(q25) format for a 64-point FFT, as there is 1 middle stage
     * data is in 5.27(q27) format for a 16-point FFT, as there are no middle stages
     */

    /*
     * Start of last-stage processing
     */
    uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);

    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vhaddq(vecA, vecC);
        vecDiff0 = vhsubq(vecA, vecC);

        vecB = vldrwq_gather_base_s32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_s32(vecScGathAddr, 24);

        vecSum1 = vhaddq(vecB, vecD);
        vecDiff1 = vhsubq(vecB, vecD);
        /*
         * pre-load for next iteration
         */
        vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);

        vecTmp0 = vhaddq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vhsubq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }
    /*
     * output is in 11.21(q21) format for a 1024-point FFT
     * output is in 9.23(q23) format for a 256-point FFT
     * output is in 7.25(q25) format for a 64-point FFT
     * output is in 5.27(q27) format for a 16-point FFT
     */
}

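/*
 * Inverse radix-4-by-2: same structure as arm_cfft_radix4by2_q31_mve above,
 * but the radix-2 stage uses a direct (non-conjugated) twiddle multiply and
 * the two half-length transforms use the inverse radix-4 kernel.
 */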
static void arm_cfft_radix4by2_inverse_q31_mve(const arm_cfft_instance_q31 *S, q31_t *pSrc, uint32_t fftLen)
{
    uint32_t n2;
    q31_t *pIn0;
    q31_t *pIn1;
    const q31_t *pCoef = S->pTwiddle;

    uint32_t blkCnt;
    q31x4_t vecIn0, vecIn1, vecSum, vecDiff;
    q31x4_t vecCmplxTmp, vecTw;

    n2 = fftLen >> 1;

    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;
    blkCnt = n2 / 2;

    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn1 = vld1q_s32(pIn1);

        vecIn0 = vecIn0 >> 1;
        vecIn1 = vecIn1 >> 1;
        vecSum = vhaddq(vecIn0, vecIn1);
        vst1q(pIn0, vecSum);
        pIn0 += 4;

        vecTw = vld1q_s32(pCoef);
        pCoef += 4;
        vecDiff = vhsubq(vecIn0, vecIn1);

        vecCmplxTmp = MVE_CMPLX_MULT_FX_AxB(vecDiff, vecTw, q31x4_t);
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        blkCnt--;
    }

    _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, n2);

    _arm_radix4_butterfly_inverse_q31_mve(S, pSrc + fftLen, n2);

    pIn0 = pSrc;
    blkCnt = (fftLen << 1) >> 2;
    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vst1q(pIn0, vecIn0);
        pIn0 += 4;
        blkCnt--;
    }
    /*
     * tail
     * (residual elements handled through tail predication)
     */
    blkCnt = (fftLen << 1) & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);

        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vstrwq_p(pIn0, vecIn0, p0);
    }
}


/**
  @addtogroup ComplexFFTQ31
  @{
 */

/**
  @brief Processing function for the Q31 complex FFT.
  @param[in] S points to an instance of the fixed-point CFFT structure
  @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  @param[in] ifftFlag flag that selects transform direction
  - value = 0: forward transform
  - value = 1: inverse transform
  @param[in] bitReverseFlag flag that enables / disables bit reversal of output
  - value = 0: disables bit reversal of output
  - value = 1: enables bit reversal of output
 */
ARM_DSP_ATTRIBUTE void arm_cfft_q31(
    const arm_cfft_instance_q31 * S,
    q31_t * pSrc,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag)
{
    uint32_t fftLen = S->fftLen;

    if (ifftFlag == 1U) {
        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen);
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen);
            break;
        }
    } else {
        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen);
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen);
            break;
        }
    }

    if (bitReverseFlag)
    {
        arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
    }
}

/**
  @} end of ComplexFFTQ31 group
 */
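
/*
 * Usage sketch (illustrative only, not part of the library): in-place
 * forward 256-point CFFT. Assumes arm_cfft_init_q31() from CMSIS-DSP is
 * used to populate the instance.
 *
 *   #include "dsp/transform_functions.h"
 *
 *   static arm_cfft_instance_q31 cfft;
 *   static q31_t buf[2 * 256];              // interleaved re/im, q31
 *
 *   if (arm_cfft_init_q31(&cfft, 256) == ARM_MATH_SUCCESS)
 *   {
 *       arm_cfft_q31(&cfft, buf, 0, 1);     // forward + bit reversal
 *       // buf now holds the spectrum in natural order, scaled by 1/256
 *       // (9.23 format, as noted in the butterfly kernels above)
 *   }
 */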
#else

extern void arm_radix4_butterfly_q31(
    q31_t * pSrc,
    uint32_t fftLen,
    const q31_t * pCoef,
    uint32_t twidCoefModifier);

extern void arm_radix4_butterfly_inverse_q31(
    q31_t * pSrc,
    uint32_t fftLen,
    const q31_t * pCoef,
    uint32_t twidCoefModifier);

extern void arm_bitreversal_32(
    uint32_t * pSrc,
    const uint16_t bitRevLen,
    const uint16_t * pBitRevTable);

ARM_DSP_ATTRIBUTE void arm_cfft_radix4by2_q31(
    q31_t * pSrc,
    uint32_t fftLen,
    const q31_t * pCoef);

ARM_DSP_ATTRIBUTE void arm_cfft_radix4by2_inverse_q31(
    q31_t * pSrc,
    uint32_t fftLen,
    const q31_t * pCoef);


/**
  @addtogroup ComplexFFTQ31
  @{
 */

/**
  @brief Processing function for the Q31 complex FFT.
  @param[in] S points to an instance of the fixed-point CFFT structure
  @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  @param[in] ifftFlag flag that selects transform direction
  - value = 0: forward transform
  - value = 1: inverse transform
  @param[in] bitReverseFlag flag that enables / disables bit reversal of output
  - value = 0: disables bit reversal of output
  - value = 1: enables bit reversal of output
 */
ARM_DSP_ATTRIBUTE void arm_cfft_q31(
    const arm_cfft_instance_q31 * S,
    q31_t * p1,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag)
{
    uint32_t L = S->fftLen;

    if (ifftFlag == 1U)
    {
        switch (L)
        {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            arm_radix4_butterfly_inverse_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_inverse_q31 ( p1, L, S->pTwiddle );
            break;
        }
    }
    else
    {
        switch (L)
        {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            arm_radix4_butterfly_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_q31 ( p1, L, S->pTwiddle );
            break;
        }
    }

    if ( bitReverseFlag )
        arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);
}

/**
  @} end of ComplexFFTQ31 group
 */

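/*
 * Scalar radix-4-by-2 (forward), a sketch of the flow: a radix-2 DIF stage
 * with >>2 input scaling and a conjugate twiddle rotation,
 *
 *   out[i]       = (x[i] + x[i + N/2]) / 4
 *   out[i + N/2] = ((x[i] - x[i + N/2]) / 4) * (cos - j*sin) * 2
 *
 * followed by two half-length radix-4 transforms over the same twiddle
 * table (modifier 2), and a final <<1 renormalization pass.
 */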
ARM_DSP_ATTRIBUTE void arm_cfft_radix4by2_q31(
    q31_t * pSrc,
    uint32_t fftLen,
    const q31_t * pCoef)
{
    uint32_t i, l;
    uint32_t n2;
    q31_t xt, yt, cosVal, sinVal;
    q31_t p0, p1;

    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        cosVal = pCoef[2 * i];
        sinVal = pCoef[2 * i + 1];

        l = i + n2;

        xt = (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U);
        pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U);

        yt = (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U);
        pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U);

        mult_32x32_keep32_R(p0, xt, cosVal);
        mult_32x32_keep32_R(p1, yt, cosVal);
        multAcc_32x32_keep32_R(p0, yt, sinVal);
        multSub_32x32_keep32_R(p1, xt, sinVal);

        pSrc[2 * l] = p0 << 1;
        pSrc[2 * l + 1] = p1 << 1;
    }

    /* first col */
    arm_radix4_butterfly_q31 (pSrc, n2, (q31_t*)pCoef, 2U);

    /* second col */
    arm_radix4_butterfly_q31 (pSrc + fftLen, n2, (q31_t*)pCoef, 2U);

    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        p0 = pSrc[4 * i + 0];
        p1 = pSrc[4 * i + 1];
        xt = pSrc[4 * i + 2];
        yt = pSrc[4 * i + 3];

        p0 <<= 1U;
        p1 <<= 1U;
        xt <<= 1U;
        yt <<= 1U;

        pSrc[4 * i + 0] = p0;
        pSrc[4 * i + 1] = p1;
        pSrc[4 * i + 2] = xt;
        pSrc[4 * i + 3] = yt;
    }
}

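/*
 * Scalar radix-4-by-2 (inverse): mirrors the forward version above, but the
 * radix-2 stage rotates by the non-conjugated twiddle (cos + j*sin) and the
 * half-length transforms use the inverse radix-4 kernel.
 */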
ARM_DSP_ATTRIBUTE void arm_cfft_radix4by2_inverse_q31(
    q31_t * pSrc,
    uint32_t fftLen,
    const q31_t * pCoef)
{
    uint32_t i, l;
    uint32_t n2;
    q31_t xt, yt, cosVal, sinVal;
    q31_t p0, p1;

    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        cosVal = pCoef[2 * i];
        sinVal = pCoef[2 * i + 1];

        l = i + n2;

        xt = (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U);
        pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U);

        yt = (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U);
        pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U);

        mult_32x32_keep32_R(p0, xt, cosVal);
        mult_32x32_keep32_R(p1, yt, cosVal);
        multSub_32x32_keep32_R(p0, yt, sinVal);
        multAcc_32x32_keep32_R(p1, xt, sinVal);

        pSrc[2 * l] = p0 << 1U;
        pSrc[2 * l + 1] = p1 << 1U;
    }

    /* first col */
    arm_radix4_butterfly_inverse_q31( pSrc, n2, (q31_t*)pCoef, 2U);

    /* second col */
    arm_radix4_butterfly_inverse_q31( pSrc + fftLen, n2, (q31_t*)pCoef, 2U);

    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        p0 = pSrc[4 * i + 0];
        p1 = pSrc[4 * i + 1];
        xt = pSrc[4 * i + 2];
        yt = pSrc[4 * i + 3];

        p0 <<= 1U;
        p1 <<= 1U;
        xt <<= 1U;
        yt <<= 1U;

        pSrc[4 * i + 0] = p0;
        pSrc[4 * i + 1] = p1;
        pSrc[4 * i + 2] = xt;
        pSrc[4 * i + 3] = yt;
    }
}
#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */