1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cfft_radix4_q31.c
4  * Description:  This file has function definition of Radix-4 FFT & IFFT function and
5  *               In-place bit reversal using bit reversal table
6  *
7  * $Date:        23 April 2021
8  * $Revision:    V1.9.0
9  *
10  * Target Processor: Cortex-M and Cortex-A cores
11  * -------------------------------------------------------------------- */
12 /*
13  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
14  *
15  * SPDX-License-Identifier: Apache-2.0
16  *
17  * Licensed under the Apache License, Version 2.0 (the License); you may
18  * not use this file except in compliance with the License.
19  * You may obtain a copy of the License at
20  *
21  * www.apache.org/licenses/LICENSE-2.0
22  *
23  * Unless required by applicable law or agreed to in writing, software
24  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
25  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26  * See the License for the specific language governing permissions and
27  * limitations under the License.
28  */
29 
30 #include "dsp/transform_functions.h"
31 
32 void arm_radix4_butterfly_inverse_q31(
33         q31_t * pSrc,
34         uint32_t fftLen,
35   const q31_t * pCoef,
36         uint32_t twidCoefModifier);
37 
38 void arm_radix4_butterfly_q31(
39         q31_t * pSrc,
40         uint32_t fftLen,
41   const q31_t * pCoef,
42         uint32_t twidCoefModifier);
43 
44 void arm_bitreversal_q31(
45         q31_t * pSrc,
46         uint32_t fftLen,
47         uint16_t bitRevFactor,
48   const uint16_t * pBitRevTab);
49 
50 /**
51   @ingroup groupTransforms
52  */
53 
54 /**
55   @addtogroup ComplexFFT
56   @{
57  */
58 
59 /**
60   @brief         Processing function for the Q31 CFFT/CIFFT.
61   @deprecated    Do not use this function.  It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
62   @param[in]     S    points to an instance of the Q31 CFFT/CIFFT structure
63   @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
64   @return        none
65 
66   @par Input and output formats:
67                  Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
68                  Hence the output format is different for different FFT sizes.
69                  The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
70   @par
71                  \image html CFFTQ31.gif "Input and Output Formats for Q31 CFFT"
72                  \image html CIFFTQ31.gif "Input and Output Formats for Q31 CIFFT"
73  */
74 
arm_cfft_radix4_q31(const arm_cfft_radix4_instance_q31 * S,q31_t * pSrc)75 void arm_cfft_radix4_q31(
76   const arm_cfft_radix4_instance_q31 * S,
77         q31_t * pSrc)
78 {
79   if (S->ifftFlag == 1U)
80   {
81     /* Complex IFFT radix-4 */
82     arm_radix4_butterfly_inverse_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
83   }
84   else
85   {
86     /* Complex FFT radix-4 */
87     arm_radix4_butterfly_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
88   }
89 
90   if (S->bitReverseFlag == 1U)
91   {
92     /*  Bit Reversal */
93     arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
94   }
95 
96 }
97 
98 /**
99   @} end of ComplexFFT group
100  */
101 
102 /*
103  * Radix-4 FFT algorithm used is :
104  *
105  * Input real and imaginary data:
106  * x(n) = xa + j * ya
107  * x(n+N/4 ) = xb + j * yb
108  * x(n+N/2 ) = xc + j * yc
109  * x(n+3N 4) = xd + j * yd
110  *
111  *
112  * Output real and imaginary data:
113  * x(4r) = xa'+ j * ya'
114  * x(4r+1) = xb'+ j * yb'
115  * x(4r+2) = xc'+ j * yc'
116  * x(4r+3) = xd'+ j * yd'
117  *
118  *
119  * Twiddle factors for radix-4 FFT:
120  * Wn = co1 + j * (- si1)
121  * W2n = co2 + j * (- si2)
122  * W3n = co3 + j * (- si3)
123  *
124  *  Butterfly implementation:
125  * xa' = xa + xb + xc + xd
126  * ya' = ya + yb + yc + yd
127  * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
128  * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
129  * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
130  * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
131  * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
132  * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
133  *
134  */
135 
136 /**
137   @brief         Core function for the Q31 CFFT butterfly process.
138   @param[in,out] pSrc             points to the in-place buffer of Q31 data type.
139   @param[in]     fftLen           length of the FFT.
140   @param[in]     pCoef            points to twiddle coefficient buffer.
141   @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
142   @return        none
143  */
144 
arm_radix4_butterfly_q31(q31_t * pSrc,uint32_t fftLen,const q31_t * pCoef,uint32_t twidCoefModifier)145 void arm_radix4_butterfly_q31(
146         q31_t * pSrc,
147         uint32_t fftLen,
148   const q31_t * pCoef,
149         uint32_t twidCoefModifier)
150 {
151         uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
152         q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
153 
154         q31_t xa, xb, xc, xd;
155         q31_t ya, yb, yc, yd;
156         q31_t xa_out, xb_out, xc_out, xd_out;
157         q31_t ya_out, yb_out, yc_out, yd_out;
158 
159         q31_t *ptr1;
160 
161   /* Total process is divided into three stages */
162 
163   /* process first stage, middle stages, & last stage */
164 
165 
166   /* start of first stage process */
167 
168   /*  Initializations for the first stage */
169   n2 = fftLen;
170   n1 = n2;
171   /* n2 = fftLen/4 */
172   n2 >>= 2U;
173   i0 = 0U;
174   ia1 = 0U;
175 
176   j = n2;
177 
178   /*  Calculation of first stage */
179   do
180   {
181     /*  index calculation for the input as, */
182     /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
183     i1 = i0 + n2;
184     i2 = i1 + n2;
185     i3 = i2 + n2;
186 
187     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
188 
189     /*  Butterfly implementation */
190     /* xa + xc */
191     r1 = (pSrc[(2U * i0)] >> 4U) + (pSrc[(2U * i2)] >> 4U);
192     /* xa - xc */
193     r2 = (pSrc[(2U * i0)] >> 4U) - (pSrc[(2U * i2)] >> 4U);
194 
195     /* xb + xd */
196     t1 = (pSrc[(2U * i1)] >> 4U) + (pSrc[(2U * i3)] >> 4U);
197 
198     /* ya + yc */
199     s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
200     /* ya - yc */
201     s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
202 
203     /* xa' = xa + xb + xc + xd */
204     pSrc[2U * i0] = (r1 + t1);
205     /* (xa + xc) - (xb + xd) */
206     r1 = r1 - t1;
207     /* yb + yd */
208     t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
209 
210     /* ya' = ya + yb + yc + yd */
211     pSrc[(2U * i0) + 1U] = (s1 + t2);
212 
213     /* (ya + yc) - (yb + yd) */
214     s1 = s1 - t2;
215 
216     /* yb - yd */
217     t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
218     /* xb - xd */
219     t2 = (pSrc[(2U * i1)] >> 4U) - (pSrc[(2U * i3)] >> 4U);
220 
221     /*  index calculation for the coefficients */
222     ia2 = 2U * ia1;
223     co2 = pCoef[(ia2 * 2U)];
224     si2 = pCoef[(ia2 * 2U) + 1U];
225 
226     /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
227     pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
228                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
229 
230     /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
231     pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
232                             ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
233 
234     /* (xa - xc) + (yb - yd) */
235     r1 = r2 + t1;
236     /* (xa - xc) - (yb - yd) */
237     r2 = r2 - t1;
238 
239     /* (ya - yc) - (xb - xd) */
240     s1 = s2 - t2;
241     /* (ya - yc) + (xb - xd) */
242     s2 = s2 + t2;
243 
244     co1 = pCoef[(ia1 * 2U)];
245     si1 = pCoef[(ia1 * 2U) + 1U];
246 
247     /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
248     pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
249                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
250 
251     /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
252     pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
253                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
254 
255     /*  index calculation for the coefficients */
256     ia3 = 3U * ia1;
257     co3 = pCoef[(ia3 * 2U)];
258     si3 = pCoef[(ia3 * 2U) + 1U];
259 
260     /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
261     pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
262                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
263 
264     /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
265     pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
266                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
267 
268     /*  Twiddle coefficients index modifier */
269     ia1 = ia1 + twidCoefModifier;
270 
271     /*  Updating input index */
272     i0 = i0 + 1U;
273 
274   } while (--j);
275 
276   /* end of first stage process */
277 
278   /* data is in 5.27(q27) format */
279 
280 
281   /* start of Middle stages process */
282 
283 
284   /* each stage in middle stages provides two down scaling of the input */
285 
286   twidCoefModifier <<= 2U;
287 
288 
289   for (k = fftLen / 4U; k > 4U; k >>= 2U)
290   {
291     /*  Initializations for the first stage */
292     n1 = n2;
293     n2 >>= 2U;
294     ia1 = 0U;
295 
296     /*  Calculation of first stage */
297     for (j = 0U; j <= (n2 - 1U); j++)
298     {
299       /*  index calculation for the coefficients */
300       ia2 = ia1 + ia1;
301       ia3 = ia2 + ia1;
302       co1 = pCoef[(ia1 * 2U)];
303       si1 = pCoef[(ia1 * 2U) + 1U];
304       co2 = pCoef[(ia2 * 2U)];
305       si2 = pCoef[(ia2 * 2U) + 1U];
306       co3 = pCoef[(ia3 * 2U)];
307       si3 = pCoef[(ia3 * 2U) + 1U];
308       /*  Twiddle coefficients index modifier */
309       ia1 = ia1 + twidCoefModifier;
310 
311       for (i0 = j; i0 < fftLen; i0 += n1)
312       {
313         /*  index calculation for the input as, */
314         /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
315         i1 = i0 + n2;
316         i2 = i1 + n2;
317         i3 = i2 + n2;
318 
319         /*  Butterfly implementation */
320         /* xa + xc */
321         r1 = pSrc[2U * i0] + pSrc[2U * i2];
322         /* xa - xc */
323         r2 = pSrc[2U * i0] - pSrc[2U * i2];
324 
325         /* ya + yc */
326         s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
327         /* ya - yc */
328         s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
329 
330         /* xb + xd */
331         t1 = pSrc[2U * i1] + pSrc[2U * i3];
332 
333         /* xa' = xa + xb + xc + xd */
334         pSrc[2U * i0] = (r1 + t1) >> 2U;
335         /* xa + xc -(xb + xd) */
336         r1 = r1 - t1;
337 
338         /* yb + yd */
339         t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
340         /* ya' = ya + yb + yc + yd */
341         pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
342 
343         /* (ya + yc) - (yb + yd) */
344         s1 = s1 - t2;
345 
346         /* (yb - yd) */
347         t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
348         /* (xb - xd) */
349         t2 = pSrc[2U * i1] - pSrc[2U * i3];
350 
351         /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
352         pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
353                          ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
354 
355         /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
356         pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
357                                 ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
358 
359         /* (xa - xc) + (yb - yd) */
360         r1 = r2 + t1;
361         /* (xa - xc) - (yb - yd) */
362         r2 = r2 - t1;
363 
364         /* (ya - yc) -  (xb - xd) */
365         s1 = s2 - t2;
366         /* (ya - yc) +  (xb - xd) */
367         s2 = s2 + t2;
368 
369         /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
370         pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
371                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
372 
373         /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
374         pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
375                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
376 
377         /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
378         pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
379                          ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
380 
381         /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
382         pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
383                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
384       }
385     }
386     twidCoefModifier <<= 2U;
387   }
388 
389   /* End of Middle stages process */
390 
391   /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
392   /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
393   /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
394   /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
395 
396 
397   /* start of Last stage process */
398   /*  Initializations for the last stage */
399   j = fftLen >> 2;
400   ptr1 = &pSrc[0];
401 
402   /*  Calculations of last stage */
403   do
404   {
405     /* Read xa (real), ya(imag) input */
406     xa = *ptr1++;
407     ya = *ptr1++;
408 
409     /* Read xb (real), yb(imag) input */
410     xb = *ptr1++;
411     yb = *ptr1++;
412 
413     /* Read xc (real), yc(imag) input */
414     xc = *ptr1++;
415     yc = *ptr1++;
416 
417     /* Read xc (real), yc(imag) input */
418     xd = *ptr1++;
419     yd = *ptr1++;
420 
421     /* xa' = xa + xb + xc + xd */
422     xa_out = xa + xb + xc + xd;
423 
424     /* ya' = ya + yb + yc + yd */
425     ya_out = ya + yb + yc + yd;
426 
427     /* pointer updation for writing */
428     ptr1 = ptr1 - 8U;
429 
430     /* writing xa' and ya' */
431     *ptr1++ = xa_out;
432     *ptr1++ = ya_out;
433 
434     xc_out = (xa - xb + xc - xd);
435     yc_out = (ya - yb + yc - yd);
436 
437     /* writing xc' and yc' */
438     *ptr1++ = xc_out;
439     *ptr1++ = yc_out;
440 
441     xb_out = (xa + yb - xc - yd);
442     yb_out = (ya - xb - yc + xd);
443 
444     /* writing xb' and yb' */
445     *ptr1++ = xb_out;
446     *ptr1++ = yb_out;
447 
448     xd_out = (xa - yb - xc + yd);
449     yd_out = (ya + xb - yc - xd);
450 
451     /* writing xd' and yd' */
452     *ptr1++ = xd_out;
453     *ptr1++ = yd_out;
454 
455 
456   } while (--j);
457 
458   /* output is in 11.21(q21) format for the 1024 point */
459   /* output is in 9.23(q23) format for the 256 point */
460   /* output is in 7.25(q25) format for the 64 point */
461   /* output is in 5.27(q27) format for the 16 point */
462 
463   /* End of last stage process */
464 
465 }
466 
467 
468 /**
469   @brief         Core function for the Q31 CIFFT butterfly process.
470   @param[in,out] pSrc             points to the in-place buffer of Q31 data type.
471   @param[in]     fftLen           length of the FFT.
472   @param[in]     pCoef            points to twiddle coefficient buffer.
473   @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
474   @return        none
475  */
476 
477 /*
478  * Radix-4 IFFT algorithm used is :
479  *
480  * CIFFT uses same twiddle coefficients as CFFT Function
481  *  x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
482  *
483  *
484  * IFFT is implemented with following changes in equations from FFT
485  *
486  * Input real and imaginary data:
487  * x(n) = xa + j * ya
488  * x(n+N/4 ) = xb + j * yb
489  * x(n+N/2 ) = xc + j * yc
490  * x(n+3N 4) = xd + j * yd
491  *
492  *
493  * Output real and imaginary data:
494  * x(4r) = xa'+ j * ya'
495  * x(4r+1) = xb'+ j * yb'
496  * x(4r+2) = xc'+ j * yc'
497  * x(4r+3) = xd'+ j * yd'
498  *
499  *
500  * Twiddle factors for radix-4 IFFT:
501  * Wn = co1 + j * (si1)
502  * W2n = co2 + j * (si2)
503  * W3n = co3 + j * (si3)
504 
505  * The real and imaginary output values for the radix-4 butterfly are
506  * xa' = xa + xb + xc + xd
507  * ya' = ya + yb + yc + yd
508  * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
509  * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
510  * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
511  * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
512  * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
513  * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
514  *
515  */
516 
arm_radix4_butterfly_inverse_q31(q31_t * pSrc,uint32_t fftLen,const q31_t * pCoef,uint32_t twidCoefModifier)517 void arm_radix4_butterfly_inverse_q31(
518         q31_t * pSrc,
519         uint32_t fftLen,
520   const q31_t * pCoef,
521         uint32_t twidCoefModifier)
522 {
523         uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
524         q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
525         q31_t xa, xb, xc, xd;
526         q31_t ya, yb, yc, yd;
527         q31_t xa_out, xb_out, xc_out, xd_out;
528         q31_t ya_out, yb_out, yc_out, yd_out;
529 
530         q31_t *ptr1;
531 
532   /* input is be 1.31(q31) format for all FFT sizes */
533   /* Total process is divided into three stages */
534   /* process first stage, middle stages, & last stage */
535 
536   /* Start of first stage process */
537 
538   /* Initializations for the first stage */
539   n2 = fftLen;
540   n1 = n2;
541   /* n2 = fftLen/4 */
542   n2 >>= 2U;
543   i0 = 0U;
544   ia1 = 0U;
545 
546   j = n2;
547 
548   do
549   {
550     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
551 
552     /*  index calculation for the input as, */
553     /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
554     i1 = i0 + n2;
555     i2 = i1 + n2;
556     i3 = i2 + n2;
557 
558     /*  Butterfly implementation */
559     /* xa + xc */
560     r1 = (pSrc[2U * i0] >> 4U) + (pSrc[2U * i2] >> 4U);
561     /* xa - xc */
562     r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
563 
564     /* xb + xd */
565     t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
566 
567     /* ya + yc */
568     s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
569     /* ya - yc */
570     s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
571 
572     /* xa' = xa + xb + xc + xd */
573     pSrc[2U * i0] = (r1 + t1);
574     /* (xa + xc) - (xb + xd) */
575     r1 = r1 - t1;
576     /* yb + yd */
577     t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
578     /* ya' = ya + yb + yc + yd */
579     pSrc[(2U * i0) + 1U] = (s1 + t2);
580 
581     /* (ya + yc) - (yb + yd) */
582     s1 = s1 - t2;
583 
584     /* yb - yd */
585     t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
586     /* xb - xd */
587     t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
588 
589     /*  index calculation for the coefficients */
590     ia2 = 2U * ia1;
591     co2 = pCoef[ia2 * 2U];
592     si2 = pCoef[(ia2 * 2U) + 1U];
593 
594     /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
595     pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
596                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
597 
598     /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
599     pSrc[2U * i1 + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
600                           ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
601 
602     /* (xa - xc) - (yb - yd) */
603     r1 = r2 - t1;
604     /* (xa - xc) + (yb - yd) */
605     r2 = r2 + t1;
606 
607     /* (ya - yc) + (xb - xd) */
608     s1 = s2 + t2;
609     /* (ya - yc) - (xb - xd) */
610     s2 = s2 - t2;
611 
612     co1 = pCoef[ia1 * 2U];
613     si1 = pCoef[(ia1 * 2U) + 1U];
614 
615     /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
616     pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
617                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
618 
619     /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
620     pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
621                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
622 
623     /*  index calculation for the coefficients */
624     ia3 = 3U * ia1;
625     co3 = pCoef[ia3 * 2U];
626     si3 = pCoef[(ia3 * 2U) + 1U];
627 
628     /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
629     pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
630                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
631 
632     /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
633     pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
634                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
635 
636     /*  Twiddle coefficients index modifier */
637     ia1 = ia1 + twidCoefModifier;
638 
639     /*  Updating input index */
640     i0 = i0 + 1U;
641 
642   } while (--j);
643 
644   /* data is in 5.27(q27) format */
645   /* each stage provides two down scaling of the input */
646 
647 
648   /* Start of Middle stages process */
649 
650   twidCoefModifier <<= 2U;
651 
652   /*  Calculation of second stage to excluding last stage */
653   for (k = fftLen / 4U; k > 4U; k >>= 2U)
654   {
655     /*  Initializations for the first stage */
656     n1 = n2;
657     n2 >>= 2U;
658     ia1 = 0U;
659 
660     for (j = 0; j <= (n2 - 1U); j++)
661     {
662       /*  index calculation for the coefficients */
663       ia2 = ia1 + ia1;
664       ia3 = ia2 + ia1;
665       co1 = pCoef[(ia1 * 2U)];
666       si1 = pCoef[(ia1 * 2U) + 1U];
667       co2 = pCoef[(ia2 * 2U)];
668       si2 = pCoef[(ia2 * 2U) + 1U];
669       co3 = pCoef[(ia3 * 2U)];
670       si3 = pCoef[(ia3 * 2U) + 1U];
671       /*  Twiddle coefficients index modifier */
672       ia1 = ia1 + twidCoefModifier;
673 
674       for (i0 = j; i0 < fftLen; i0 += n1)
675       {
676         /*  index calculation for the input as, */
677         /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
678         i1 = i0 + n2;
679         i2 = i1 + n2;
680         i3 = i2 + n2;
681 
682         /*  Butterfly implementation */
683         /* xa + xc */
684         r1 = pSrc[2U * i0] + pSrc[2U * i2];
685         /* xa - xc */
686         r2 = pSrc[2U * i0] - pSrc[2U * i2];
687 
688         /* ya + yc */
689         s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
690         /* ya - yc */
691         s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
692 
693         /* xb + xd */
694         t1 = pSrc[2U * i1] + pSrc[2U * i3];
695 
696         /* xa' = xa + xb + xc + xd */
697         pSrc[2U * i0] = (r1 + t1) >> 2U;
698         /* xa + xc -(xb + xd) */
699         r1 = r1 - t1;
700         /* yb + yd */
701         t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
702         /* ya' = ya + yb + yc + yd */
703         pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
704 
705         /* (ya + yc) - (yb + yd) */
706         s1 = s1 - t2;
707 
708         /* (yb - yd) */
709         t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
710         /* (xb - xd) */
711         t2 = pSrc[2U * i1] - pSrc[2U * i3];
712 
713         /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
714         pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
715                          ((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
716 
717         /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
718         pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
719                                 ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
720 
721         /* (xa - xc) - (yb - yd) */
722         r1 = r2 - t1;
723         /* (xa - xc) + (yb - yd) */
724         r2 = r2 + t1;
725 
726         /* (ya - yc) +  (xb - xd) */
727         s1 = s2 + t2;
728         /* (ya - yc) -  (xb - xd) */
729         s2 = s2 - t2;
730 
731         /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
732         pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
733                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
734 
735         /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
736         pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
737                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
738 
739         /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
740         pSrc[(2U * i3)] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
741                            ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
742 
743         /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
744         pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
745                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
746       }
747     }
748     twidCoefModifier <<= 2U;
749   }
750 
751   /* End of Middle stages process */
752 
753   /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
754   /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
755   /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
756   /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
757 
758 
759   /* Start of last stage process */
760 
761 
762   /*  Initializations for the last stage */
763   j = fftLen >> 2;
764   ptr1 = &pSrc[0];
765 
766   /*  Calculations of last stage */
767   do
768   {
769     /* Read xa (real), ya(imag) input */
770     xa = *ptr1++;
771     ya = *ptr1++;
772 
773     /* Read xb (real), yb(imag) input */
774     xb = *ptr1++;
775     yb = *ptr1++;
776 
777     /* Read xc (real), yc(imag) input */
778     xc = *ptr1++;
779     yc = *ptr1++;
780 
781     /* Read xc (real), yc(imag) input */
782     xd = *ptr1++;
783     yd = *ptr1++;
784 
785     /* xa' = xa + xb + xc + xd */
786     xa_out = xa + xb + xc + xd;
787 
788     /* ya' = ya + yb + yc + yd */
789     ya_out = ya + yb + yc + yd;
790 
791     /* pointer updation for writing */
792     ptr1 = ptr1 - 8U;
793 
794     /* writing xa' and ya' */
795     *ptr1++ = xa_out;
796     *ptr1++ = ya_out;
797 
798     xc_out = (xa - xb + xc - xd);
799     yc_out = (ya - yb + yc - yd);
800 
801     /* writing xc' and yc' */
802     *ptr1++ = xc_out;
803     *ptr1++ = yc_out;
804 
805     xb_out = (xa - yb - xc + yd);
806     yb_out = (ya + xb - yc - xd);
807 
808     /* writing xb' and yb' */
809     *ptr1++ = xb_out;
810     *ptr1++ = yb_out;
811 
812     xd_out = (xa + yb - xc - yd);
813     yd_out = (ya - xb - yc + xd);
814 
815     /* writing xd' and yd' */
816     *ptr1++ = xd_out;
817     *ptr1++ = yd_out;
818 
819   } while (--j);
820 
821   /* output is in 11.21(q21) format for the 1024 point */
822   /* output is in 9.23(q23) format for the 256 point */
823   /* output is in 7.25(q25) format for the 64 point */
824   /* output is in 5.27(q27) format for the 16 point */
825 
826   /* End of last stage process */
827 }
828