1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cfft_radix4_q31.c
4  * Description:  This file has function definition of Radix-4 FFT & IFFT function and
5  *               In-place bit reversal using bit reversal table
6  *
7  * $Date:        23 April 2021
8  * $Revision:    V1.9.0
9  *
10  * Target Processor: Cortex-M and Cortex-A cores
11  * -------------------------------------------------------------------- */
12 /*
13  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
14  *
15  * SPDX-License-Identifier: Apache-2.0
16  *
17  * Licensed under the Apache License, Version 2.0 (the License); you may
18  * not use this file except in compliance with the License.
19  * You may obtain a copy of the License at
20  *
21  * www.apache.org/licenses/LICENSE-2.0
22  *
23  * Unless required by applicable law or agreed to in writing, software
24  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
25  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26  * See the License for the specific language governing permissions and
27  * limitations under the License.
28  */
29 
30 #include "dsp/transform_functions.h"
31 
32 void arm_radix4_butterfly_inverse_q31(
33         q31_t * pSrc,
34         uint32_t fftLen,
35   const q31_t * pCoef,
36         uint32_t twidCoefModifier);
37 
38 void arm_radix4_butterfly_q31(
39         q31_t * pSrc,
40         uint32_t fftLen,
41   const q31_t * pCoef,
42         uint32_t twidCoefModifier);
43 
44 void arm_bitreversal_q31(
45         q31_t * pSrc,
46         uint32_t fftLen,
47         uint16_t bitRevFactor,
48   const uint16_t * pBitRevTab);
49 
50 
51 /**
52   @addtogroup ComplexFFTDeprecated
53   @{
54  */
55 
56 /**
57   @brief         Processing function for the Q31 CFFT/CIFFT.
58   @deprecated    Do not use this function.  It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
59   @param[in]     S    points to an instance of the Q31 CFFT/CIFFT structure
60   @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
61   @return        none
62 
63   @par Input and output formats:
64                  Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
65                  Hence the output format is different for different FFT sizes.
66                  The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
67   @par
68 
69 | CFFT Size | Input format  | Output format | Number of bits to upscale |
70 | --------: | ------------: | ------------: | ------------------------: |
71 | 16        | 1.31          | 5.27          | 4                         |
72 | 64        | 1.31          | 7.25          | 6                         |
73 | 256       | 1.31          | 9.23          | 8                         |
74 | 1024      | 1.31          | 11.21         | 10                        |
75 
76 | CIFFT Size | Input format  | Output format | Number of bits to upscale |
77 | ---------: | ------------: | ------------: | ------------------------: |
78 | 16         | 1.31          | 5.27          | 0                         |
79 | 64         | 1.31          | 7.25          | 0                         |
80 | 256        | 1.31          | 9.23          | 0                         |
81 | 1024       | 1.31          | 11.21         | 0                         |
82 
83  */
84 
arm_cfft_radix4_q31(const arm_cfft_radix4_instance_q31 * S,q31_t * pSrc)85 void arm_cfft_radix4_q31(
86   const arm_cfft_radix4_instance_q31 * S,
87         q31_t * pSrc)
88 {
89   if (S->ifftFlag == 1U)
90   {
91     /* Complex IFFT radix-4 */
92     arm_radix4_butterfly_inverse_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
93   }
94   else
95   {
96     /* Complex FFT radix-4 */
97     arm_radix4_butterfly_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
98   }
99 
100   if (S->bitReverseFlag == 1U)
101   {
102     /*  Bit Reversal */
103     arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
104   }
105 
106 }
107 
108 /**
109   @} end of ComplexFFTDeprecated group
110  */
111 
112 /*
113  * Radix-4 FFT algorithm used is :
114  *
115  * Input real and imaginary data:
116  * x(n) = xa + j * ya
117  * x(n+N/4 ) = xb + j * yb
118  * x(n+N/2 ) = xc + j * yc
119  * x(n+3N 4) = xd + j * yd
120  *
121  *
122  * Output real and imaginary data:
123  * x(4r) = xa'+ j * ya'
124  * x(4r+1) = xb'+ j * yb'
125  * x(4r+2) = xc'+ j * yc'
126  * x(4r+3) = xd'+ j * yd'
127  *
128  *
129  * Twiddle factors for radix-4 FFT:
130  * Wn = co1 + j * (- si1)
131  * W2n = co2 + j * (- si2)
132  * W3n = co3 + j * (- si3)
133  *
134  *  Butterfly implementation:
135  * xa' = xa + xb + xc + xd
136  * ya' = ya + yb + yc + yd
137  * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
138  * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
139  * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
140  * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
141  * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
142  * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
143  *
144  */
145 
146 /**
147   @brief         Core function for the Q31 CFFT butterfly process.
148   @param[in,out] pSrc             points to the in-place buffer of Q31 data type.
149   @param[in]     fftLen           length of the FFT.
150   @param[in]     pCoef            points to twiddle coefficient buffer.
151   @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
152   @return        none
153  */
154 
arm_radix4_butterfly_q31(q31_t * pSrc,uint32_t fftLen,const q31_t * pCoef,uint32_t twidCoefModifier)155 void arm_radix4_butterfly_q31(
156         q31_t * pSrc,
157         uint32_t fftLen,
158   const q31_t * pCoef,
159         uint32_t twidCoefModifier)
160 {
161         uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
162         q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
163 
164         q31_t xa, xb, xc, xd;
165         q31_t ya, yb, yc, yd;
166         q31_t xa_out, xb_out, xc_out, xd_out;
167         q31_t ya_out, yb_out, yc_out, yd_out;
168 
169         q31_t *ptr1;
170 
171   /* Total process is divided into three stages */
172 
173   /* process first stage, middle stages, & last stage */
174 
175 
176   /* start of first stage process */
177 
178   /*  Initializations for the first stage */
179   n2 = fftLen;
180   n1 = n2;
181   /* n2 = fftLen/4 */
182   n2 >>= 2U;
183   i0 = 0U;
184   ia1 = 0U;
185 
186   j = n2;
187 
188   /*  Calculation of first stage */
189   do
190   {
191     /*  index calculation for the input as, */
192     /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
193     i1 = i0 + n2;
194     i2 = i1 + n2;
195     i3 = i2 + n2;
196 
197     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
198 
199     /*  Butterfly implementation */
200     /* xa + xc */
201     r1 = (pSrc[(2U * i0)] >> 4U) + (pSrc[(2U * i2)] >> 4U);
202     /* xa - xc */
203     r2 = (pSrc[(2U * i0)] >> 4U) - (pSrc[(2U * i2)] >> 4U);
204 
205     /* xb + xd */
206     t1 = (pSrc[(2U * i1)] >> 4U) + (pSrc[(2U * i3)] >> 4U);
207 
208     /* ya + yc */
209     s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
210     /* ya - yc */
211     s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
212 
213     /* xa' = xa + xb + xc + xd */
214     pSrc[2U * i0] = (r1 + t1);
215     /* (xa + xc) - (xb + xd) */
216     r1 = r1 - t1;
217     /* yb + yd */
218     t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
219 
220     /* ya' = ya + yb + yc + yd */
221     pSrc[(2U * i0) + 1U] = (s1 + t2);
222 
223     /* (ya + yc) - (yb + yd) */
224     s1 = s1 - t2;
225 
226     /* yb - yd */
227     t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
228     /* xb - xd */
229     t2 = (pSrc[(2U * i1)] >> 4U) - (pSrc[(2U * i3)] >> 4U);
230 
231     /*  index calculation for the coefficients */
232     ia2 = 2U * ia1;
233     co2 = pCoef[(ia2 * 2U)];
234     si2 = pCoef[(ia2 * 2U) + 1U];
235 
236     /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
237     pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
238                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
239 
240     /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
241     pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
242                             ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
243 
244     /* (xa - xc) + (yb - yd) */
245     r1 = r2 + t1;
246     /* (xa - xc) - (yb - yd) */
247     r2 = r2 - t1;
248 
249     /* (ya - yc) - (xb - xd) */
250     s1 = s2 - t2;
251     /* (ya - yc) + (xb - xd) */
252     s2 = s2 + t2;
253 
254     co1 = pCoef[(ia1 * 2U)];
255     si1 = pCoef[(ia1 * 2U) + 1U];
256 
257     /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
258     pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
259                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
260 
261     /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
262     pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
263                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
264 
265     /*  index calculation for the coefficients */
266     ia3 = 3U * ia1;
267     co3 = pCoef[(ia3 * 2U)];
268     si3 = pCoef[(ia3 * 2U) + 1U];
269 
270     /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
271     pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
272                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
273 
274     /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
275     pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
276                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
277 
278     /*  Twiddle coefficients index modifier */
279     ia1 = ia1 + twidCoefModifier;
280 
281     /*  Updating input index */
282     i0 = i0 + 1U;
283 
284   } while (--j);
285 
286   /* end of first stage process */
287 
288   /* data is in 5.27(q27) format */
289 
290 
291   /* start of Middle stages process */
292 
293 
294   /* each stage in middle stages provides two down scaling of the input */
295 
296   twidCoefModifier <<= 2U;
297 
298 
299   for (k = fftLen / 4U; k > 4U; k >>= 2U)
300   {
301     /*  Initializations for the first stage */
302     n1 = n2;
303     n2 >>= 2U;
304     ia1 = 0U;
305 
306     /*  Calculation of first stage */
307     for (j = 0U; j <= (n2 - 1U); j++)
308     {
309       /*  index calculation for the coefficients */
310       ia2 = ia1 + ia1;
311       ia3 = ia2 + ia1;
312       co1 = pCoef[(ia1 * 2U)];
313       si1 = pCoef[(ia1 * 2U) + 1U];
314       co2 = pCoef[(ia2 * 2U)];
315       si2 = pCoef[(ia2 * 2U) + 1U];
316       co3 = pCoef[(ia3 * 2U)];
317       si3 = pCoef[(ia3 * 2U) + 1U];
318       /*  Twiddle coefficients index modifier */
319       ia1 = ia1 + twidCoefModifier;
320 
321       for (i0 = j; i0 < fftLen; i0 += n1)
322       {
323         /*  index calculation for the input as, */
324         /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
325         i1 = i0 + n2;
326         i2 = i1 + n2;
327         i3 = i2 + n2;
328 
329         /*  Butterfly implementation */
330         /* xa + xc */
331         r1 = pSrc[2U * i0] + pSrc[2U * i2];
332         /* xa - xc */
333         r2 = pSrc[2U * i0] - pSrc[2U * i2];
334 
335         /* ya + yc */
336         s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
337         /* ya - yc */
338         s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
339 
340         /* xb + xd */
341         t1 = pSrc[2U * i1] + pSrc[2U * i3];
342 
343         /* xa' = xa + xb + xc + xd */
344         pSrc[2U * i0] = (r1 + t1) >> 2U;
345         /* xa + xc -(xb + xd) */
346         r1 = r1 - t1;
347 
348         /* yb + yd */
349         t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
350         /* ya' = ya + yb + yc + yd */
351         pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
352 
353         /* (ya + yc) - (yb + yd) */
354         s1 = s1 - t2;
355 
356         /* (yb - yd) */
357         t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
358         /* (xb - xd) */
359         t2 = pSrc[2U * i1] - pSrc[2U * i3];
360 
361         /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
362         pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
363                          ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
364 
365         /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
366         pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
367                                 ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
368 
369         /* (xa - xc) + (yb - yd) */
370         r1 = r2 + t1;
371         /* (xa - xc) - (yb - yd) */
372         r2 = r2 - t1;
373 
374         /* (ya - yc) -  (xb - xd) */
375         s1 = s2 - t2;
376         /* (ya - yc) +  (xb - xd) */
377         s2 = s2 + t2;
378 
379         /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
380         pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
381                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
382 
383         /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
384         pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
385                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
386 
387         /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
388         pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
389                          ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
390 
391         /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
392         pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
393                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
394       }
395     }
396     twidCoefModifier <<= 2U;
397   }
398 
399   /* End of Middle stages process */
400 
401   /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
402   /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
403   /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
404   /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
405 
406 
407   /* start of Last stage process */
408   /*  Initializations for the last stage */
409   j = fftLen >> 2;
410   ptr1 = &pSrc[0];
411 
412   /*  Calculations of last stage */
413   do
414   {
415     /* Read xa (real), ya(imag) input */
416     xa = *ptr1++;
417     ya = *ptr1++;
418 
419     /* Read xb (real), yb(imag) input */
420     xb = *ptr1++;
421     yb = *ptr1++;
422 
423     /* Read xc (real), yc(imag) input */
424     xc = *ptr1++;
425     yc = *ptr1++;
426 
427     /* Read xc (real), yc(imag) input */
428     xd = *ptr1++;
429     yd = *ptr1++;
430 
431     /* xa' = xa + xb + xc + xd */
432     xa_out = xa + xb + xc + xd;
433 
434     /* ya' = ya + yb + yc + yd */
435     ya_out = ya + yb + yc + yd;
436 
437     /* pointer updation for writing */
438     ptr1 = ptr1 - 8U;
439 
440     /* writing xa' and ya' */
441     *ptr1++ = xa_out;
442     *ptr1++ = ya_out;
443 
444     xc_out = (xa - xb + xc - xd);
445     yc_out = (ya - yb + yc - yd);
446 
447     /* writing xc' and yc' */
448     *ptr1++ = xc_out;
449     *ptr1++ = yc_out;
450 
451     xb_out = (xa + yb - xc - yd);
452     yb_out = (ya - xb - yc + xd);
453 
454     /* writing xb' and yb' */
455     *ptr1++ = xb_out;
456     *ptr1++ = yb_out;
457 
458     xd_out = (xa - yb - xc + yd);
459     yd_out = (ya + xb - yc - xd);
460 
461     /* writing xd' and yd' */
462     *ptr1++ = xd_out;
463     *ptr1++ = yd_out;
464 
465 
466   } while (--j);
467 
468   /* output is in 11.21(q21) format for the 1024 point */
469   /* output is in 9.23(q23) format for the 256 point */
470   /* output is in 7.25(q25) format for the 64 point */
471   /* output is in 5.27(q27) format for the 16 point */
472 
473   /* End of last stage process */
474 
475 }
476 
477 
478 /**
479   @brief         Core function for the Q31 CIFFT butterfly process.
480   @param[in,out] pSrc             points to the in-place buffer of Q31 data type.
481   @param[in]     fftLen           length of the FFT.
482   @param[in]     pCoef            points to twiddle coefficient buffer.
483   @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
484   @return        none
485  */
486 
487 /*
488  * Radix-4 IFFT algorithm used is :
489  *
490  * CIFFT uses same twiddle coefficients as CFFT Function
491  *  x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
492  *
493  *
494  * IFFT is implemented with following changes in equations from FFT
495  *
496  * Input real and imaginary data:
497  * x(n) = xa + j * ya
498  * x(n+N/4 ) = xb + j * yb
499  * x(n+N/2 ) = xc + j * yc
500  * x(n+3N 4) = xd + j * yd
501  *
502  *
503  * Output real and imaginary data:
504  * x(4r) = xa'+ j * ya'
505  * x(4r+1) = xb'+ j * yb'
506  * x(4r+2) = xc'+ j * yc'
507  * x(4r+3) = xd'+ j * yd'
508  *
509  *
510  * Twiddle factors for radix-4 IFFT:
511  * Wn = co1 + j * (si1)
512  * W2n = co2 + j * (si2)
513  * W3n = co3 + j * (si3)
514 
515  * The real and imaginary output values for the radix-4 butterfly are
516  * xa' = xa + xb + xc + xd
517  * ya' = ya + yb + yc + yd
518  * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
519  * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
520  * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
521  * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
522  * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
523  * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
524  *
525  */
526 
arm_radix4_butterfly_inverse_q31(q31_t * pSrc,uint32_t fftLen,const q31_t * pCoef,uint32_t twidCoefModifier)527 void arm_radix4_butterfly_inverse_q31(
528         q31_t * pSrc,
529         uint32_t fftLen,
530   const q31_t * pCoef,
531         uint32_t twidCoefModifier)
532 {
533         uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
534         q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
535         q31_t xa, xb, xc, xd;
536         q31_t ya, yb, yc, yd;
537         q31_t xa_out, xb_out, xc_out, xd_out;
538         q31_t ya_out, yb_out, yc_out, yd_out;
539 
540         q31_t *ptr1;
541 
542   /* input is be 1.31(q31) format for all FFT sizes */
543   /* Total process is divided into three stages */
544   /* process first stage, middle stages, & last stage */
545 
546   /* Start of first stage process */
547 
548   /* Initializations for the first stage */
549   n2 = fftLen;
550   n1 = n2;
551   /* n2 = fftLen/4 */
552   n2 >>= 2U;
553   i0 = 0U;
554   ia1 = 0U;
555 
556   j = n2;
557 
558   do
559   {
560     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
561 
562     /*  index calculation for the input as, */
563     /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
564     i1 = i0 + n2;
565     i2 = i1 + n2;
566     i3 = i2 + n2;
567 
568     /*  Butterfly implementation */
569     /* xa + xc */
570     r1 = (pSrc[2U * i0] >> 4U) + (pSrc[2U * i2] >> 4U);
571     /* xa - xc */
572     r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
573 
574     /* xb + xd */
575     t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
576 
577     /* ya + yc */
578     s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
579     /* ya - yc */
580     s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
581 
582     /* xa' = xa + xb + xc + xd */
583     pSrc[2U * i0] = (r1 + t1);
584     /* (xa + xc) - (xb + xd) */
585     r1 = r1 - t1;
586     /* yb + yd */
587     t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
588     /* ya' = ya + yb + yc + yd */
589     pSrc[(2U * i0) + 1U] = (s1 + t2);
590 
591     /* (ya + yc) - (yb + yd) */
592     s1 = s1 - t2;
593 
594     /* yb - yd */
595     t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
596     /* xb - xd */
597     t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
598 
599     /*  index calculation for the coefficients */
600     ia2 = 2U * ia1;
601     co2 = pCoef[ia2 * 2U];
602     si2 = pCoef[(ia2 * 2U) + 1U];
603 
604     /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
605     pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
606                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
607 
608     /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
609     pSrc[2U * i1 + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
610                           ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
611 
612     /* (xa - xc) - (yb - yd) */
613     r1 = r2 - t1;
614     /* (xa - xc) + (yb - yd) */
615     r2 = r2 + t1;
616 
617     /* (ya - yc) + (xb - xd) */
618     s1 = s2 + t2;
619     /* (ya - yc) - (xb - xd) */
620     s2 = s2 - t2;
621 
622     co1 = pCoef[ia1 * 2U];
623     si1 = pCoef[(ia1 * 2U) + 1U];
624 
625     /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
626     pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
627                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
628 
629     /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
630     pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
631                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
632 
633     /*  index calculation for the coefficients */
634     ia3 = 3U * ia1;
635     co3 = pCoef[ia3 * 2U];
636     si3 = pCoef[(ia3 * 2U) + 1U];
637 
638     /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
639     pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
640                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
641 
642     /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
643     pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
644                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
645 
646     /*  Twiddle coefficients index modifier */
647     ia1 = ia1 + twidCoefModifier;
648 
649     /*  Updating input index */
650     i0 = i0 + 1U;
651 
652   } while (--j);
653 
654   /* data is in 5.27(q27) format */
655   /* each stage provides two down scaling of the input */
656 
657 
658   /* Start of Middle stages process */
659 
660   twidCoefModifier <<= 2U;
661 
662   /*  Calculation of second stage to excluding last stage */
663   for (k = fftLen / 4U; k > 4U; k >>= 2U)
664   {
665     /*  Initializations for the first stage */
666     n1 = n2;
667     n2 >>= 2U;
668     ia1 = 0U;
669 
670     for (j = 0; j <= (n2 - 1U); j++)
671     {
672       /*  index calculation for the coefficients */
673       ia2 = ia1 + ia1;
674       ia3 = ia2 + ia1;
675       co1 = pCoef[(ia1 * 2U)];
676       si1 = pCoef[(ia1 * 2U) + 1U];
677       co2 = pCoef[(ia2 * 2U)];
678       si2 = pCoef[(ia2 * 2U) + 1U];
679       co3 = pCoef[(ia3 * 2U)];
680       si3 = pCoef[(ia3 * 2U) + 1U];
681       /*  Twiddle coefficients index modifier */
682       ia1 = ia1 + twidCoefModifier;
683 
684       for (i0 = j; i0 < fftLen; i0 += n1)
685       {
686         /*  index calculation for the input as, */
687         /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
688         i1 = i0 + n2;
689         i2 = i1 + n2;
690         i3 = i2 + n2;
691 
692         /*  Butterfly implementation */
693         /* xa + xc */
694         r1 = pSrc[2U * i0] + pSrc[2U * i2];
695         /* xa - xc */
696         r2 = pSrc[2U * i0] - pSrc[2U * i2];
697 
698         /* ya + yc */
699         s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
700         /* ya - yc */
701         s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
702 
703         /* xb + xd */
704         t1 = pSrc[2U * i1] + pSrc[2U * i3];
705 
706         /* xa' = xa + xb + xc + xd */
707         pSrc[2U * i0] = (r1 + t1) >> 2U;
708         /* xa + xc -(xb + xd) */
709         r1 = r1 - t1;
710         /* yb + yd */
711         t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
712         /* ya' = ya + yb + yc + yd */
713         pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
714 
715         /* (ya + yc) - (yb + yd) */
716         s1 = s1 - t2;
717 
718         /* (yb - yd) */
719         t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
720         /* (xb - xd) */
721         t2 = pSrc[2U * i1] - pSrc[2U * i3];
722 
723         /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
724         pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
725                          ((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
726 
727         /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
728         pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
729                                 ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
730 
731         /* (xa - xc) - (yb - yd) */
732         r1 = r2 - t1;
733         /* (xa - xc) + (yb - yd) */
734         r2 = r2 + t1;
735 
736         /* (ya - yc) +  (xb - xd) */
737         s1 = s2 + t2;
738         /* (ya - yc) -  (xb - xd) */
739         s2 = s2 - t2;
740 
741         /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
742         pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
743                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
744 
745         /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
746         pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
747                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
748 
749         /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
750         pSrc[(2U * i3)] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
751                            ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
752 
753         /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
754         pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
755                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
756       }
757     }
758     twidCoefModifier <<= 2U;
759   }
760 
761   /* End of Middle stages process */
762 
763   /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
764   /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
765   /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
766   /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
767 
768 
769   /* Start of last stage process */
770 
771 
772   /*  Initializations for the last stage */
773   j = fftLen >> 2;
774   ptr1 = &pSrc[0];
775 
776   /*  Calculations of last stage */
777   do
778   {
779     /* Read xa (real), ya(imag) input */
780     xa = *ptr1++;
781     ya = *ptr1++;
782 
783     /* Read xb (real), yb(imag) input */
784     xb = *ptr1++;
785     yb = *ptr1++;
786 
787     /* Read xc (real), yc(imag) input */
788     xc = *ptr1++;
789     yc = *ptr1++;
790 
791     /* Read xc (real), yc(imag) input */
792     xd = *ptr1++;
793     yd = *ptr1++;
794 
795     /* xa' = xa + xb + xc + xd */
796     xa_out = xa + xb + xc + xd;
797 
798     /* ya' = ya + yb + yc + yd */
799     ya_out = ya + yb + yc + yd;
800 
801     /* pointer updation for writing */
802     ptr1 = ptr1 - 8U;
803 
804     /* writing xa' and ya' */
805     *ptr1++ = xa_out;
806     *ptr1++ = ya_out;
807 
808     xc_out = (xa - xb + xc - xd);
809     yc_out = (ya - yb + yc - yd);
810 
811     /* writing xc' and yc' */
812     *ptr1++ = xc_out;
813     *ptr1++ = yc_out;
814 
815     xb_out = (xa - yb - xc + yd);
816     yb_out = (ya + xb - yc - xd);
817 
818     /* writing xb' and yb' */
819     *ptr1++ = xb_out;
820     *ptr1++ = yb_out;
821 
822     xd_out = (xa + yb - xc - yd);
823     yd_out = (ya - xb - yc + xd);
824 
825     /* writing xd' and yd' */
826     *ptr1++ = xd_out;
827     *ptr1++ = yd_out;
828 
829   } while (--j);
830 
831   /* output is in 11.21(q21) format for the 1024 point */
832   /* output is in 9.23(q23) format for the 256 point */
833   /* output is in 7.25(q25) format for the 64 point */
834   /* output is in 5.27(q27) format for the 16 point */
835 
836   /* End of last stage process */
837 }
838