1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cfft_radix4_q31.c
4  * Description:  This file has function definition of Radix-4 FFT & IFFT function and
5  *               In-place bit reversal using bit reversal table
6  *
7  * $Date:        23 April 2021
8  * $Revision:    V1.9.0
9  *
10  * Target Processor: Cortex-M and Cortex-A cores
11  * -------------------------------------------------------------------- */
12 /*
13  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
14  *
15  * SPDX-License-Identifier: Apache-2.0
16  *
17  * Licensed under the Apache License, Version 2.0 (the License); you may
18  * not use this file except in compliance with the License.
19  * You may obtain a copy of the License at
20  *
21  * www.apache.org/licenses/LICENSE-2.0
22  *
23  * Unless required by applicable law or agreed to in writing, software
24  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
25  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26  * See the License for the specific language governing permissions and
27  * limitations under the License.
28  */
29 
30 #include "dsp/transform_functions.h"
31 
32 void arm_radix4_butterfly_inverse_q31(
33         q31_t * pSrc,
34         uint32_t fftLen,
35   const q31_t * pCoef,
36         uint32_t twidCoefModifier);
37 
38 void arm_radix4_butterfly_q31(
39         q31_t * pSrc,
40         uint32_t fftLen,
41   const q31_t * pCoef,
42         uint32_t twidCoefModifier);
43 
44 void arm_bitreversal_q31(
45         q31_t * pSrc,
46         uint32_t fftLen,
47         uint16_t bitRevFactor,
48   const uint16_t * pBitRevTab);
49 
50 
51 /**
52   @addtogroup ComplexFFTDeprecated
53   @{
54  */
55 
56 /**
57   @brief         Processing function for the Q31 CFFT/CIFFT.
58   @deprecated    Do not use this function.  It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
59   @param[in]     S    points to an instance of the Q31 CFFT/CIFFT structure
60   @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
61 
62   @par Input and output formats:
63                  Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
64                  Hence the output format is different for different FFT sizes.
65                  The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
66   @par
67 
68 | CFFT Size | Input format  | Output format | Number of bits to upscale |
69 | --------: | ------------: | ------------: | ------------------------: |
70 | 16        | 1.31          | 5.27          | 4                         |
71 | 64        | 1.31          | 7.25          | 6                         |
72 | 256       | 1.31          | 9.23          | 8                         |
73 | 1024      | 1.31          | 11.21         | 10                        |
74 
75 | CIFFT Size | Input format  | Output format | Number of bits to upscale |
76 | ---------: | ------------: | ------------: | ------------------------: |
77 | 16         | 1.31          | 5.27          | 0                         |
78 | 64         | 1.31          | 7.25          | 0                         |
79 | 256        | 1.31          | 9.23          | 0                         |
80 | 1024       | 1.31          | 11.21         | 0                         |
81 
82  */
83 
arm_cfft_radix4_q31(const arm_cfft_radix4_instance_q31 * S,q31_t * pSrc)84 void arm_cfft_radix4_q31(
85   const arm_cfft_radix4_instance_q31 * S,
86         q31_t * pSrc)
87 {
88   if (S->ifftFlag == 1U)
89   {
90     /* Complex IFFT radix-4 */
91     arm_radix4_butterfly_inverse_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
92   }
93   else
94   {
95     /* Complex FFT radix-4 */
96     arm_radix4_butterfly_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
97   }
98 
99   if (S->bitReverseFlag == 1U)
100   {
101     /*  Bit Reversal */
102     arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
103   }
104 
105 }
106 
107 /**
108   @} end of ComplexFFTDeprecated group
109  */
110 
111 /*
112  * Radix-4 FFT algorithm used is :
113  *
114  * Input real and imaginary data:
115  * x(n) = xa + j * ya
116  * x(n+N/4 ) = xb + j * yb
117  * x(n+N/2 ) = xc + j * yc
118  * x(n+3N 4) = xd + j * yd
119  *
120  *
121  * Output real and imaginary data:
122  * x(4r) = xa'+ j * ya'
123  * x(4r+1) = xb'+ j * yb'
124  * x(4r+2) = xc'+ j * yc'
125  * x(4r+3) = xd'+ j * yd'
126  *
127  *
128  * Twiddle factors for radix-4 FFT:
129  * Wn = co1 + j * (- si1)
130  * W2n = co2 + j * (- si2)
131  * W3n = co3 + j * (- si3)
132  *
133  *  Butterfly implementation:
134  * xa' = xa + xb + xc + xd
135  * ya' = ya + yb + yc + yd
136  * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
137  * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
138  * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
139  * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
140  * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
141  * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
142  *
143  */
144 
145 /**
146   @brief         Core function for the Q31 CFFT butterfly process.
147   @param[in,out] pSrc             points to the in-place buffer of Q31 data type.
148   @param[in]     fftLen           length of the FFT.
149   @param[in]     pCoef            points to twiddle coefficient buffer.
150   @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
151  */
152 
arm_radix4_butterfly_q31(q31_t * pSrc,uint32_t fftLen,const q31_t * pCoef,uint32_t twidCoefModifier)153 void arm_radix4_butterfly_q31(
154         q31_t * pSrc,
155         uint32_t fftLen,
156   const q31_t * pCoef,
157         uint32_t twidCoefModifier)
158 {
159         uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
160         q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
161 
162         q31_t xa, xb, xc, xd;
163         q31_t ya, yb, yc, yd;
164         q31_t xa_out, xb_out, xc_out, xd_out;
165         q31_t ya_out, yb_out, yc_out, yd_out;
166 
167         q31_t *ptr1;
168 
169   /* Total process is divided into three stages */
170 
171   /* process first stage, middle stages, & last stage */
172 
173 
174   /* start of first stage process */
175 
176   /*  Initializations for the first stage */
177   n2 = fftLen;
178   n1 = n2;
179   /* n2 = fftLen/4 */
180   n2 >>= 2U;
181   i0 = 0U;
182   ia1 = 0U;
183 
184   j = n2;
185 
186   /*  Calculation of first stage */
187   do
188   {
189     /*  index calculation for the input as, */
190     /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
191     i1 = i0 + n2;
192     i2 = i1 + n2;
193     i3 = i2 + n2;
194 
195     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
196 
197     /*  Butterfly implementation */
198     /* xa + xc */
199     r1 = (pSrc[(2U * i0)] >> 4U) + (pSrc[(2U * i2)] >> 4U);
200     /* xa - xc */
201     r2 = (pSrc[(2U * i0)] >> 4U) - (pSrc[(2U * i2)] >> 4U);
202 
203     /* xb + xd */
204     t1 = (pSrc[(2U * i1)] >> 4U) + (pSrc[(2U * i3)] >> 4U);
205 
206     /* ya + yc */
207     s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
208     /* ya - yc */
209     s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
210 
211     /* xa' = xa + xb + xc + xd */
212     pSrc[2U * i0] = (r1 + t1);
213     /* (xa + xc) - (xb + xd) */
214     r1 = r1 - t1;
215     /* yb + yd */
216     t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
217 
218     /* ya' = ya + yb + yc + yd */
219     pSrc[(2U * i0) + 1U] = (s1 + t2);
220 
221     /* (ya + yc) - (yb + yd) */
222     s1 = s1 - t2;
223 
224     /* yb - yd */
225     t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
226     /* xb - xd */
227     t2 = (pSrc[(2U * i1)] >> 4U) - (pSrc[(2U * i3)] >> 4U);
228 
229     /*  index calculation for the coefficients */
230     ia2 = 2U * ia1;
231     co2 = pCoef[(ia2 * 2U)];
232     si2 = pCoef[(ia2 * 2U) + 1U];
233 
234     /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
235     pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
236                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
237 
238     /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
239     pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
240                             ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
241 
242     /* (xa - xc) + (yb - yd) */
243     r1 = r2 + t1;
244     /* (xa - xc) - (yb - yd) */
245     r2 = r2 - t1;
246 
247     /* (ya - yc) - (xb - xd) */
248     s1 = s2 - t2;
249     /* (ya - yc) + (xb - xd) */
250     s2 = s2 + t2;
251 
252     co1 = pCoef[(ia1 * 2U)];
253     si1 = pCoef[(ia1 * 2U) + 1U];
254 
255     /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
256     pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
257                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
258 
259     /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
260     pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
261                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
262 
263     /*  index calculation for the coefficients */
264     ia3 = 3U * ia1;
265     co3 = pCoef[(ia3 * 2U)];
266     si3 = pCoef[(ia3 * 2U) + 1U];
267 
268     /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
269     pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
270                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
271 
272     /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
273     pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
274                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
275 
276     /*  Twiddle coefficients index modifier */
277     ia1 = ia1 + twidCoefModifier;
278 
279     /*  Updating input index */
280     i0 = i0 + 1U;
281 
282   } while (--j);
283 
284   /* end of first stage process */
285 
286   /* data is in 5.27(q27) format */
287 
288 
289   /* start of Middle stages process */
290 
291 
292   /* each stage in middle stages provides two down scaling of the input */
293 
294   twidCoefModifier <<= 2U;
295 
296 
297   for (k = fftLen / 4U; k > 4U; k >>= 2U)
298   {
299     /*  Initializations for the first stage */
300     n1 = n2;
301     n2 >>= 2U;
302     ia1 = 0U;
303 
304     /*  Calculation of first stage */
305     for (j = 0U; j <= (n2 - 1U); j++)
306     {
307       /*  index calculation for the coefficients */
308       ia2 = ia1 + ia1;
309       ia3 = ia2 + ia1;
310       co1 = pCoef[(ia1 * 2U)];
311       si1 = pCoef[(ia1 * 2U) + 1U];
312       co2 = pCoef[(ia2 * 2U)];
313       si2 = pCoef[(ia2 * 2U) + 1U];
314       co3 = pCoef[(ia3 * 2U)];
315       si3 = pCoef[(ia3 * 2U) + 1U];
316       /*  Twiddle coefficients index modifier */
317       ia1 = ia1 + twidCoefModifier;
318 
319       for (i0 = j; i0 < fftLen; i0 += n1)
320       {
321         /*  index calculation for the input as, */
322         /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
323         i1 = i0 + n2;
324         i2 = i1 + n2;
325         i3 = i2 + n2;
326 
327         /*  Butterfly implementation */
328         /* xa + xc */
329         r1 = pSrc[2U * i0] + pSrc[2U * i2];
330         /* xa - xc */
331         r2 = pSrc[2U * i0] - pSrc[2U * i2];
332 
333         /* ya + yc */
334         s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
335         /* ya - yc */
336         s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
337 
338         /* xb + xd */
339         t1 = pSrc[2U * i1] + pSrc[2U * i3];
340 
341         /* xa' = xa + xb + xc + xd */
342         pSrc[2U * i0] = (r1 + t1) >> 2U;
343         /* xa + xc -(xb + xd) */
344         r1 = r1 - t1;
345 
346         /* yb + yd */
347         t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
348         /* ya' = ya + yb + yc + yd */
349         pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
350 
351         /* (ya + yc) - (yb + yd) */
352         s1 = s1 - t2;
353 
354         /* (yb - yd) */
355         t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
356         /* (xb - xd) */
357         t2 = pSrc[2U * i1] - pSrc[2U * i3];
358 
359         /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
360         pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
361                          ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
362 
363         /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
364         pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
365                                 ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
366 
367         /* (xa - xc) + (yb - yd) */
368         r1 = r2 + t1;
369         /* (xa - xc) - (yb - yd) */
370         r2 = r2 - t1;
371 
372         /* (ya - yc) -  (xb - xd) */
373         s1 = s2 - t2;
374         /* (ya - yc) +  (xb - xd) */
375         s2 = s2 + t2;
376 
377         /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
378         pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
379                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
380 
381         /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
382         pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
383                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
384 
385         /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
386         pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
387                          ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
388 
389         /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
390         pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
391                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
392       }
393     }
394     twidCoefModifier <<= 2U;
395   }
396 
397   /* End of Middle stages process */
398 
399   /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
400   /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
401   /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
402   /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
403 
404 
405   /* start of Last stage process */
406   /*  Initializations for the last stage */
407   j = fftLen >> 2;
408   ptr1 = &pSrc[0];
409 
410   /*  Calculations of last stage */
411   do
412   {
413     /* Read xa (real), ya(imag) input */
414     xa = *ptr1++;
415     ya = *ptr1++;
416 
417     /* Read xb (real), yb(imag) input */
418     xb = *ptr1++;
419     yb = *ptr1++;
420 
421     /* Read xc (real), yc(imag) input */
422     xc = *ptr1++;
423     yc = *ptr1++;
424 
425     /* Read xc (real), yc(imag) input */
426     xd = *ptr1++;
427     yd = *ptr1++;
428 
429     /* xa' = xa + xb + xc + xd */
430     xa_out = xa + xb + xc + xd;
431 
432     /* ya' = ya + yb + yc + yd */
433     ya_out = ya + yb + yc + yd;
434 
435     /* pointer updation for writing */
436     ptr1 = ptr1 - 8U;
437 
438     /* writing xa' and ya' */
439     *ptr1++ = xa_out;
440     *ptr1++ = ya_out;
441 
442     xc_out = (xa - xb + xc - xd);
443     yc_out = (ya - yb + yc - yd);
444 
445     /* writing xc' and yc' */
446     *ptr1++ = xc_out;
447     *ptr1++ = yc_out;
448 
449     xb_out = (xa + yb - xc - yd);
450     yb_out = (ya - xb - yc + xd);
451 
452     /* writing xb' and yb' */
453     *ptr1++ = xb_out;
454     *ptr1++ = yb_out;
455 
456     xd_out = (xa - yb - xc + yd);
457     yd_out = (ya + xb - yc - xd);
458 
459     /* writing xd' and yd' */
460     *ptr1++ = xd_out;
461     *ptr1++ = yd_out;
462 
463 
464   } while (--j);
465 
466   /* output is in 11.21(q21) format for the 1024 point */
467   /* output is in 9.23(q23) format for the 256 point */
468   /* output is in 7.25(q25) format for the 64 point */
469   /* output is in 5.27(q27) format for the 16 point */
470 
471   /* End of last stage process */
472 
473 }
474 
475 
476 /**
477   @brief         Core function for the Q31 CIFFT butterfly process.
478   @param[in,out] pSrc             points to the in-place buffer of Q31 data type.
479   @param[in]     fftLen           length of the FFT.
480   @param[in]     pCoef            points to twiddle coefficient buffer.
481   @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
482  */
483 
484 /*
485  * Radix-4 IFFT algorithm used is :
486  *
487  * CIFFT uses same twiddle coefficients as CFFT Function
488  *  x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
489  *
490  *
491  * IFFT is implemented with following changes in equations from FFT
492  *
493  * Input real and imaginary data:
494  * x(n) = xa + j * ya
495  * x(n+N/4 ) = xb + j * yb
496  * x(n+N/2 ) = xc + j * yc
497  * x(n+3N 4) = xd + j * yd
498  *
499  *
500  * Output real and imaginary data:
501  * x(4r) = xa'+ j * ya'
502  * x(4r+1) = xb'+ j * yb'
503  * x(4r+2) = xc'+ j * yc'
504  * x(4r+3) = xd'+ j * yd'
505  *
506  *
507  * Twiddle factors for radix-4 IFFT:
508  * Wn = co1 + j * (si1)
509  * W2n = co2 + j * (si2)
510  * W3n = co3 + j * (si3)
511 
512  * The real and imaginary output values for the radix-4 butterfly are
513  * xa' = xa + xb + xc + xd
514  * ya' = ya + yb + yc + yd
515  * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
516  * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
517  * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
518  * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
519  * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
520  * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
521  *
522  */
523 
arm_radix4_butterfly_inverse_q31(q31_t * pSrc,uint32_t fftLen,const q31_t * pCoef,uint32_t twidCoefModifier)524 void arm_radix4_butterfly_inverse_q31(
525         q31_t * pSrc,
526         uint32_t fftLen,
527   const q31_t * pCoef,
528         uint32_t twidCoefModifier)
529 {
530         uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
531         q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
532         q31_t xa, xb, xc, xd;
533         q31_t ya, yb, yc, yd;
534         q31_t xa_out, xb_out, xc_out, xd_out;
535         q31_t ya_out, yb_out, yc_out, yd_out;
536 
537         q31_t *ptr1;
538 
539   /* input is be 1.31(q31) format for all FFT sizes */
540   /* Total process is divided into three stages */
541   /* process first stage, middle stages, & last stage */
542 
543   /* Start of first stage process */
544 
545   /* Initializations for the first stage */
546   n2 = fftLen;
547   n1 = n2;
548   /* n2 = fftLen/4 */
549   n2 >>= 2U;
550   i0 = 0U;
551   ia1 = 0U;
552 
553   j = n2;
554 
555   do
556   {
557     /* input is in 1.31(q31) format and provide 4 guard bits for the input */
558 
559     /*  index calculation for the input as, */
560     /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
561     i1 = i0 + n2;
562     i2 = i1 + n2;
563     i3 = i2 + n2;
564 
565     /*  Butterfly implementation */
566     /* xa + xc */
567     r1 = (pSrc[2U * i0] >> 4U) + (pSrc[2U * i2] >> 4U);
568     /* xa - xc */
569     r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
570 
571     /* xb + xd */
572     t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
573 
574     /* ya + yc */
575     s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
576     /* ya - yc */
577     s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
578 
579     /* xa' = xa + xb + xc + xd */
580     pSrc[2U * i0] = (r1 + t1);
581     /* (xa + xc) - (xb + xd) */
582     r1 = r1 - t1;
583     /* yb + yd */
584     t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
585     /* ya' = ya + yb + yc + yd */
586     pSrc[(2U * i0) + 1U] = (s1 + t2);
587 
588     /* (ya + yc) - (yb + yd) */
589     s1 = s1 - t2;
590 
591     /* yb - yd */
592     t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
593     /* xb - xd */
594     t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
595 
596     /*  index calculation for the coefficients */
597     ia2 = 2U * ia1;
598     co2 = pCoef[ia2 * 2U];
599     si2 = pCoef[(ia2 * 2U) + 1U];
600 
601     /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
602     pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
603                      ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
604 
605     /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
606     pSrc[2U * i1 + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
607                           ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
608 
609     /* (xa - xc) - (yb - yd) */
610     r1 = r2 - t1;
611     /* (xa - xc) + (yb - yd) */
612     r2 = r2 + t1;
613 
614     /* (ya - yc) + (xb - xd) */
615     s1 = s2 + t2;
616     /* (ya - yc) - (xb - xd) */
617     s2 = s2 - t2;
618 
619     co1 = pCoef[ia1 * 2U];
620     si1 = pCoef[(ia1 * 2U) + 1U];
621 
622     /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
623     pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
624                      ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
625 
626     /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
627     pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
628                             ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
629 
630     /*  index calculation for the coefficients */
631     ia3 = 3U * ia1;
632     co3 = pCoef[ia3 * 2U];
633     si3 = pCoef[(ia3 * 2U) + 1U];
634 
635     /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
636     pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
637                      ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
638 
639     /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
640     pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
641                             ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
642 
643     /*  Twiddle coefficients index modifier */
644     ia1 = ia1 + twidCoefModifier;
645 
646     /*  Updating input index */
647     i0 = i0 + 1U;
648 
649   } while (--j);
650 
651   /* data is in 5.27(q27) format */
652   /* each stage provides two down scaling of the input */
653 
654 
655   /* Start of Middle stages process */
656 
657   twidCoefModifier <<= 2U;
658 
659   /*  Calculation of second stage to excluding last stage */
660   for (k = fftLen / 4U; k > 4U; k >>= 2U)
661   {
662     /*  Initializations for the first stage */
663     n1 = n2;
664     n2 >>= 2U;
665     ia1 = 0U;
666 
667     for (j = 0; j <= (n2 - 1U); j++)
668     {
669       /*  index calculation for the coefficients */
670       ia2 = ia1 + ia1;
671       ia3 = ia2 + ia1;
672       co1 = pCoef[(ia1 * 2U)];
673       si1 = pCoef[(ia1 * 2U) + 1U];
674       co2 = pCoef[(ia2 * 2U)];
675       si2 = pCoef[(ia2 * 2U) + 1U];
676       co3 = pCoef[(ia3 * 2U)];
677       si3 = pCoef[(ia3 * 2U) + 1U];
678       /*  Twiddle coefficients index modifier */
679       ia1 = ia1 + twidCoefModifier;
680 
681       for (i0 = j; i0 < fftLen; i0 += n1)
682       {
683         /*  index calculation for the input as, */
684         /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
685         i1 = i0 + n2;
686         i2 = i1 + n2;
687         i3 = i2 + n2;
688 
689         /*  Butterfly implementation */
690         /* xa + xc */
691         r1 = pSrc[2U * i0] + pSrc[2U * i2];
692         /* xa - xc */
693         r2 = pSrc[2U * i0] - pSrc[2U * i2];
694 
695         /* ya + yc */
696         s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
697         /* ya - yc */
698         s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
699 
700         /* xb + xd */
701         t1 = pSrc[2U * i1] + pSrc[2U * i3];
702 
703         /* xa' = xa + xb + xc + xd */
704         pSrc[2U * i0] = (r1 + t1) >> 2U;
705         /* xa + xc -(xb + xd) */
706         r1 = r1 - t1;
707         /* yb + yd */
708         t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
709         /* ya' = ya + yb + yc + yd */
710         pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
711 
712         /* (ya + yc) - (yb + yd) */
713         s1 = s1 - t2;
714 
715         /* (yb - yd) */
716         t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
717         /* (xb - xd) */
718         t2 = pSrc[2U * i1] - pSrc[2U * i3];
719 
720         /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
721         pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
722                          ((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
723 
724         /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
725         pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
726                                 ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
727 
728         /* (xa - xc) - (yb - yd) */
729         r1 = r2 - t1;
730         /* (xa - xc) + (yb - yd) */
731         r2 = r2 + t1;
732 
733         /* (ya - yc) +  (xb - xd) */
734         s1 = s2 + t2;
735         /* (ya - yc) -  (xb - xd) */
736         s2 = s2 - t2;
737 
738         /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
739         pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
740                          ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
741 
742         /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
743         pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
744                                 ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
745 
746         /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
747         pSrc[(2U * i3)] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
748                            ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
749 
750         /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
751         pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
752                                 ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
753       }
754     }
755     twidCoefModifier <<= 2U;
756   }
757 
758   /* End of Middle stages process */
759 
760   /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
761   /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
762   /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
763   /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
764 
765 
766   /* Start of last stage process */
767 
768 
769   /*  Initializations for the last stage */
770   j = fftLen >> 2;
771   ptr1 = &pSrc[0];
772 
773   /*  Calculations of last stage */
774   do
775   {
776     /* Read xa (real), ya(imag) input */
777     xa = *ptr1++;
778     ya = *ptr1++;
779 
780     /* Read xb (real), yb(imag) input */
781     xb = *ptr1++;
782     yb = *ptr1++;
783 
784     /* Read xc (real), yc(imag) input */
785     xc = *ptr1++;
786     yc = *ptr1++;
787 
788     /* Read xc (real), yc(imag) input */
789     xd = *ptr1++;
790     yd = *ptr1++;
791 
792     /* xa' = xa + xb + xc + xd */
793     xa_out = xa + xb + xc + xd;
794 
795     /* ya' = ya + yb + yc + yd */
796     ya_out = ya + yb + yc + yd;
797 
798     /* pointer updation for writing */
799     ptr1 = ptr1 - 8U;
800 
801     /* writing xa' and ya' */
802     *ptr1++ = xa_out;
803     *ptr1++ = ya_out;
804 
805     xc_out = (xa - xb + xc - xd);
806     yc_out = (ya - yb + yc - yd);
807 
808     /* writing xc' and yc' */
809     *ptr1++ = xc_out;
810     *ptr1++ = yc_out;
811 
812     xb_out = (xa - yb - xc + yd);
813     yb_out = (ya + xb - yc - xd);
814 
815     /* writing xb' and yb' */
816     *ptr1++ = xb_out;
817     *ptr1++ = yb_out;
818 
819     xd_out = (xa + yb - xc - yd);
820     yd_out = (ya - xb - yc + xd);
821 
822     /* writing xd' and yd' */
823     *ptr1++ = xd_out;
824     *ptr1++ = yd_out;
825 
826   } while (--j);
827 
828   /* output is in 11.21(q21) format for the 1024 point */
829   /* output is in 9.23(q23) format for the 256 point */
830   /* output is in 7.25(q25) format for the 64 point */
831   /* output is in 5.27(q27) format for the 16 point */
832 
833   /* End of last stage process */
834 }
835