1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cfft_radix4_q15.c
4  * Description:  This file has function definition of Radix-4 FFT & IFFT function and
5  *               In-place bit reversal using bit reversal table
6  *
7  * $Date:        23 April 2021
8  * $Revision:    V1.9.0
9  *
10  * Target Processor: Cortex-M and Cortex-A cores
11  * -------------------------------------------------------------------- */
12 /*
13  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
14  *
15  * SPDX-License-Identifier: Apache-2.0
16  *
17  * Licensed under the Apache License, Version 2.0 (the License); you may
18  * not use this file except in compliance with the License.
19  * You may obtain a copy of the License at
20  *
21  * www.apache.org/licenses/LICENSE-2.0
22  *
23  * Unless required by applicable law or agreed to in writing, software
24  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
25  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26  * See the License for the specific language governing permissions and
27  * limitations under the License.
28  */
29 
30 #include "dsp/transform_functions.h"
31 
32 
33 void arm_radix4_butterfly_q15(
34         q15_t * pSrc16,
35         uint32_t fftLen,
36   const q15_t * pCoef16,
37         uint32_t twidCoefModifier);
38 
39 void arm_radix4_butterfly_inverse_q15(
40         q15_t * pSrc16,
41         uint32_t fftLen,
42   const q15_t * pCoef16,
43         uint32_t twidCoefModifier);
44 
45 void arm_bitreversal_q15(
46         q15_t * pSrc,
47         uint32_t fftLen,
48         uint16_t bitRevFactor,
49   const uint16_t * pBitRevTab);
50 
51 /**
52   @addtogroup ComplexFFTDeprecated
53   @{
54  */
55 
56 
57 /**
58   @brief               Processing function for the Q15 CFFT/CIFFT.
59   @deprecated          Do not use this function.  It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
60   @param[in]     S     points to an instance of the Q15 CFFT/CIFFT structure.
61   @param[in,out] pSrc  points to the complex data buffer. Processing occurs in-place.
62 
63   @par Input and output formats:
64                  Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
65                  Hence the output format is different for different FFT sizes.
66                  The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
67   @par
68 
69 | CFFT Size | Input format  | Output format | Number of bits to upscale |
70 | --------: | ------------: | ------------: | ------------------------: |
71 | 16        | 1.15          | 5.11          | 4                         |
72 | 64        | 1.15          | 7.9           | 6                         |
73 | 256       | 1.15          | 9.7           | 8                         |
74 | 1024      | 1.15          | 11.5          | 10                        |
75 
76 | CIFFT Size | Input format  | Output format | Number of bits to upscale |
77 | ---------: | ------------: | ------------: | ------------------------: |
78 | 16         | 1.15          | 5.11          | 0                         |
79 | 64         | 1.15          | 7.9           | 0                         |
80 | 256        | 1.15          | 9.7           | 0                         |
81 | 1024       | 1.15          | 11.5          | 0                         |
82 
83  */
84 
void arm_cfft_radix4_q15(
  const arm_cfft_radix4_instance_q15 * S,
        q15_t * pSrc)
{
  /*
   * Dispatcher: runs the in-place radix-4 butterfly pass selected by
   * S->ifftFlag on pSrc, then optionally reorders the result with the
   * bit-reversal table when S->bitReverseFlag is set.
   * All sizes, twiddle and bit-reversal tables are taken from the instance.
   */
  if (S->ifftFlag == 1U)
  {
    /*  Complex IFFT radix-4  */
    arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  }
  else
  {
    /*  Complex FFT radix-4  */
    arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  }

  if (S->bitReverseFlag == 1U)
  {
    /*  Bit Reversal: applied after the butterflies, on the same buffer */
    arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
  }

}
107 
108 /**
109   @} end of ComplexFFTDeprecated group
110  */
111 
/*
 * Radix-4 FFT algorithm used is :
 *
 * Input real and imaginary data:
 * x(n) = xa + j * ya
 * x(n+N/4 ) = xb + j * yb
 * x(n+N/2 ) = xc + j * yc
 * x(n+3N/4) = xd + j * yd
 *
 *
 * Output real and imaginary data:
 * x(4r) = xa'+ j * ya'
 * x(4r+1) = xb'+ j * yb'
 * x(4r+2) = xc'+ j * yc'
 * x(4r+3) = xd'+ j * yd'
 *
 *
 * Twiddle factors for radix-4 FFT:
 * Wn = co1 + j * (- si1)
 * W2n = co2 + j * (- si2)
 * W3n = co3 + j * (- si3)
 *
 * The real and imaginary output values for the radix-4 butterfly are
 * xa' = xa + xb + xc + xd
 * ya' = ya + yb + yc + yd
 * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
 * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
 * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
 * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
 * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
 * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
 *
 */
145 
146 /**
147   @brief         Core function for the Q15 CFFT butterfly process.
148   @param[in,out] pSrc16          points to the in-place buffer of Q15 data type
149   @param[in]     fftLen           length of the FFT
150   @param[in]     pCoef16         points to twiddle coefficient buffer
151   @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
152  */
153 
void arm_radix4_butterfly_q15(
        q15_t * pSrc16,
        uint32_t fftLen,
  const q15_t * pCoef16,
        uint32_t twidCoefModifier)
{
  /*
   * In-place forward radix-4 butterfly passes over the interleaved complex
   * buffer pSrc16 (real part at even index, imaginary part at odd index).
   *
   * The work is split into three phases:
   *   - first stage : inputs are scaled down by 4 before the butterfly and
   *                   the outputs are multiplied by the twiddle factors;
   *   - middle stages: one pass per remaining radix-4 level, with twiddles;
   *   - last stage  : butterflies on four consecutive samples, no twiddle
   *                   multiplication.
   *
   * twidCoefModifier is the stride through pCoef16 and is multiplied by 4
   * after every stage.  Two implementations follow: one built on the packed
   * 16-bit SIMD intrinsics (ARM_MATH_DSP) and a scalar one.
   */

#if defined (ARM_MATH_DSP)

        q31_t R, S, T, U;
        q31_t C1, C2, C3, out1, out2;
        uint32_t n1, n2, ic, i0, j, k;

        q15_t *ptr1;
        q15_t *pSi0;
        q15_t *pSi1;
        q15_t *pSi2;
        q15_t *pSi3;

        q31_t xaya, xbyb, xcyc, xdyd;

  /* Total process is divided into three stages */

  /* process first stage, middle stages, & last stage */

  /*  Initializations for the first stage */
  n2 = fftLen;
  n1 = n2;

  /* n2 = fftLen/4 */
  n2 >>= 2U;

  /* Index for twiddle coefficient */
  ic = 0U;

  /* Index for input read and output write */
  j = n2;

  /* The four butterfly legs are fftLen/4 complex samples apart */
  pSi0 = pSrc16;
  pSi1 = pSi0 + 2 * n2;
  pSi2 = pSi1 + 2 * n2;
  pSi3 = pSi2 + 2 * n2;

  /* Input is in 1.15(q15) format */

  /*  start of first stage process */
  do
  {
    /*  Butterfly implementation */

    /* Reading i0, i0+fftLen/2 inputs */
    /* Read xa (real), ya (imag) input */
    T = read_q15x2 (pSi0);
    T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
    T = __SHADD16(T, 0); /* it turns out doing this twice is 2 cycles, the alternative takes 3 cycles */
/*
    in = ((int16_t) (T & 0xFFFF)) >> 2;       // alternative code that takes 3 cycles
     T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
*/

    /* Read xc (real), yc (imag) input */
    S = read_q15x2 (pSi2);
    S = __SHADD16(S, 0);
    S = __SHADD16(S, 0);

    /* R = packed((ya + yc), (xa + xc) ) */
    R = __QADD16(T, S);

    /* S = packed((ya - yc), (xa - xc) ) */
    S = __QSUB16(T, S);

    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
    /* Read xb (real), yb (imag) input */
    T = read_q15x2 (pSi1);
    T = __SHADD16(T, 0);
    T = __SHADD16(T, 0);

    /* Read xd (real), yd (imag) input */
    U = read_q15x2 (pSi3);
    U = __SHADD16(U, 0);
    U = __SHADD16(U, 0);

    /* T = packed((yb + yd), (xb + xd) ) */
    T = __QADD16(T, U);

    /*  writing the butterfly processed i0 sample */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    write_q15x2_ia (&pSi0, __SHADD16(R, T));

    /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
    R = __QSUB16(R, T);

    /* co2 & si2 are read from SIMD Coefficient pointer */
    C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));

#ifndef ARM_MATH_BIG_ENDIAN
    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
    out1 = __SMUAD(C2, R) >> 16U;
    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
    out2 = __SMUSDX(C2, R);
#else
    /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
    out1 = __SMUSDX(R, C2) >> 16U;
    /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
    out2 = __SMUAD(C2, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /*  Reading i0+fftLen/4 again: T was overwritten by the sum above and
        the slot is about to be overwritten with the xc', yc' output */
    /* T = packed(yb, xb) */
    T = read_q15x2 (pSi1);
    T = __SHADD16(T, 0);
    T = __SHADD16(T, 0);

    /* writing the butterfly processed i0 + fftLen/4 sample */
    /* writing output(xc', yc') in little endian format */
    write_q15x2_ia (&pSi1, (q31_t) __PKHBT( out1, out2, 0 ));

    /*  Butterfly calculations */
    /* U = packed(yd, xd) */
    U = read_q15x2 (pSi3);
    U = __SHADD16(U, 0);
    U = __SHADD16(U, 0);

    /* T = packed(yb-yd, xb-xd) */
    T = __QSUB16(T, U);

#ifndef ARM_MATH_BIG_ENDIAN
    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
    R = __QASX(S, T);
    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
    S = __QSAX(S, T);
#else
    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
    R = __QSAX(S, T);
    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
    S = __QASX(S, T);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* co1 & si1 are read from SIMD Coefficient pointer */
    C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
    /*  Butterfly process for the i0+fftLen/2 sample */

#ifndef ARM_MATH_BIG_ENDIAN
    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
    out1 = __SMUAD(C1, S) >> 16U;
    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
    out2 = __SMUSDX(C1, S);
#else
    /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
    out1 = __SMUSDX(S, C1) >> 16U;
    /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
    out2 = __SMUAD(C1, S);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* writing output(xb', yb') in little endian format */
    write_q15x2_ia (&pSi2, __PKHBT( out1, out2, 0 ));

    /* co3 & si3 are read from SIMD Coefficient pointer */
    C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
    /*  Butterfly process for the i0+3fftLen/4 sample */

#ifndef ARM_MATH_BIG_ENDIAN
    /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
    out1 = __SMUAD(C3, R) >> 16U;
    /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
    out2 = __SMUSDX(C3, R);
#else
    /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
    out1 = __SMUSDX(R, C3) >> 16U;
    /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
    out2 = __SMUAD(C3, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* writing output(xd', yd') in little endian format */
    write_q15x2_ia (&pSi3, __PKHBT( out1, out2, 0 ));

    /*  Twiddle coefficients index modifier */
    ic = ic + twidCoefModifier;

  } while (--j);
  /* data is in 4.12(q12) format: input >> 2, then one more halving in the
     butterfly sum / the >> 16 of the twiddle product */

  /* end of first stage process */


  /* start of middle stage process */

  /*  Twiddle coefficients index modifier */
  twidCoefModifier <<= 2U;

  /*  Calculation of Middle stage */
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
  {
    /*  Initializations for the middle stage */
    n1 = n2;
    n2 >>= 2U;
    ic = 0U;

    for (j = 0U; j <= (n2 - 1U); j++)
    {
      /*  index calculation for the coefficients */
      C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
      C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
      C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));

      /*  Twiddle coefficients index modifier */
      ic = ic + twidCoefModifier;

      /* Legs of the butterfly are n2 complex samples apart in this stage */
      pSi0 = pSrc16 + 2 * j;
      pSi1 = pSi0 + 2 * n2;
      pSi2 = pSi1 + 2 * n2;
      pSi3 = pSi2 + 2 * n2;

      /*  Butterfly implementation */
      for (i0 = j; i0 < fftLen; i0 += n1)
      {
        /*  Reading i0, i0 + 2*n2 inputs */
        /* Read xa (real), ya (imag) input */
        T = read_q15x2 (pSi0);

        /* Read xc (real), yc (imag) input */
        S = read_q15x2 (pSi2);

        /* R = packed( (ya + yc), (xa + xc)) */
        R = __QADD16(T, S);

        /* S = packed((ya - yc), (xa - xc)) */
        S = __QSUB16(T, S);

        /*  Reading i0 + n2 , i0 + 3*n2 inputs */
        /* Read xb (real), yb (imag) input */
        T = read_q15x2 (pSi1);

        /* Read xd (real), yd (imag) input */
        U = read_q15x2 (pSi3);

        /* T = packed( (yb + yd), (xb + xd)) */
        T = __QADD16(T, U);

        /*  writing the butterfly processed i0 sample */

        /* xa' = xa + xb + xc + xd */
        /* ya' = ya + yb + yc + yd */
        out1 = __SHADD16(R, T);
        out1 = __SHADD16(out1, 0);
        write_q15x2 (pSi0, out1);
        pSi0 += 2 * n1;

        /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
        R = __SHSUB16(R, T);

#ifndef ARM_MATH_BIG_ENDIAN
        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
        out1 = __SMUAD(C2, R) >> 16U;

        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        out2 = __SMUSDX(C2, R);
#else
        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        out1 = __SMUSDX(R, C2) >> 16U;

        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
        out2 = __SMUAD(C2, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /*  Reading i0 + n2 again: T was overwritten by the sum above and
            the slot is about to be overwritten with the xc', yc' output */
        /* Read xb (real), yb (imag) input */
        T = read_q15x2 (pSi1);

        /*  writing the butterfly processed i0 + n2 sample */
        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        write_q15x2 (pSi1, __PKHBT( out1, out2, 0 ));
        pSi1 += 2 * n1;

        /*  Butterfly calculations */

        /* Read xd (real), yd (imag) input */
        U = read_q15x2 (pSi3);

        /* T = packed(yb-yd, xb-xd) */
        T = __QSUB16(T, U);

#ifndef ARM_MATH_BIG_ENDIAN
        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
        R = __SHASX(S, T);

        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
        S = __SHSAX(S, T);


        /*  Butterfly process for the i0 + 2*n2 sample */
        out1 = __SMUAD(C1, S) >> 16U;
        out2 = __SMUSDX(C1, S);
#else
        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
        R = __SHSAX(S, T);

        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
        S = __SHASX(S, T);


        /*  Butterfly process for the i0 + 2*n2 sample */
        out1 = __SMUSDX(S, C1) >> 16U;
        out2 = __SMUAD(C1, S);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
        write_q15x2 (pSi2, __PKHBT( out1, out2, 0 ));
        pSi2 += 2 * n1;

        /*  Butterfly process for the i0 + 3*n2 sample */

#ifndef ARM_MATH_BIG_ENDIAN
        out1 = __SMUAD(C3, R) >> 16U;
        out2 = __SMUSDX(C3, R);
#else
        out1 = __SMUSDX(R, C3) >> 16U;
        out2 = __SMUAD(C3, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
        /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
        write_q15x2 (pSi3, __PKHBT( out1, out2, 0 ));
        pSi3 += 2 * n1;
      }
    }
    /*  Twiddle coefficients index modifier */
    twidCoefModifier <<= 2U;
  }
  /* end of middle stage process */


  /* data is in 10.6(q6) format for the 1024 point */
  /* data is in 8.8(q8) format for the 256 point */
  /* data is in 6.10(q10) format for the 64 point */
  /* data is in 4.12(q12) format for the 16 point */

  /*  Initializations for the last stage */
  j = fftLen >> 2;

  ptr1 = &pSrc16[0];

  /* start of last stage process */

  /*  Butterfly implementation: each iteration consumes four consecutive
      complex samples and writes the four results back to the same slots */
  do
  {
    /* Read xa (real), ya(imag) input */
    xaya = read_q15x2_ia (&ptr1);

    /* Read xb (real), yb(imag) input */
    xbyb = read_q15x2_ia (&ptr1);

    /* Read xc (real), yc(imag) input */
    xcyc = read_q15x2_ia (&ptr1);

    /* Read xd (real), yd(imag) input */
    xdyd = read_q15x2_ia (&ptr1);

    /* R = packed((ya + yc), (xa + xc)) */
    R = __QADD16(xaya, xcyc);

    /* T = packed((yb + yd), (xb + xd)) */
    T = __QADD16(xbyb, xdyd);

    /* rewind to the first of the four samples for writing */
    ptr1 = ptr1 - 8U;


    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    write_q15x2_ia (&ptr1, __SHADD16(R, T));

    /* T still holds packed((yb + yd), (xb + xd)) computed above */

    /* xc' = (xa-xb+xc-xd) */
    /* yc' = (ya-yb+yc-yd) */
    write_q15x2_ia (&ptr1, __SHSUB16(R, T));

    /* S = packed((ya - yc), (xa - xc)) */
    S = __QSUB16(xaya, xcyc);

    /* U = packed( (yb - yd), (xb - xd))  */
    U = __QSUB16(xbyb, xdyd);

#ifndef ARM_MATH_BIG_ENDIAN
    /* xb' = (xa+yb-xc-yd) */
    /* yb' = (ya-xb-yc+xd) */
    write_q15x2_ia (&ptr1, __SHSAX(S, U));

    /* xd' = (xa-yb-xc+yd) */
    /* yd' = (ya+xb-yc-xd) */
    write_q15x2_ia (&ptr1, __SHASX(S, U));
#else
    /* xb' = (xa+yb-xc-yd) */
    /* yb' = (ya-xb-yc+xd) */
    write_q15x2_ia (&ptr1, __SHASX(S, U));

    /* xd' = (xa-yb-xc+yd) */
    /* yd' = (ya+xb-yc-xd) */
    write_q15x2_ia (&ptr1, __SHSAX(S, U));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

  } while (--j);

  /* end of last stage process */

  /* output is in 11.5(q5) format for the 1024 point */
  /* output is in 9.7(q7) format for the 256 point   */
  /* output is in 7.9(q9) format for the 64 point  */
  /* output is in 5.11(q11) format for the 16 point  */


#else /* #if defined (ARM_MATH_DSP) */

        q15_t R0, R1, S0, S1, T0, T1, U0, U1;
        q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
        uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;

  /* Total process is divided into three stages */

  /* process first stage, middle stages, & last stage */

  /*  Initializations for the first stage */
  n2 = fftLen;
  n1 = n2;

  /* n2 = fftLen/4 */
  n2 >>= 2U;

  /* Index for twiddle coefficient */
  ic = 0U;

  /* Index for input read and output write */
  i0 = 0U;
  j = n2;

  /* Input is in 1.15(q15) format */

  /*  start of first stage process */
  do
  {
    /*  Butterfly implementation */

    /*  index calculation for the input as, */
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
    i1 = i0 + n2;
    i2 = i1 + n2;
    i3 = i2 + n2;

    /*  Reading i0, i0+fftLen/2 inputs */

    /* input is down scale by 4 to avoid overflow */
    /* Read xa (real), ya (imag) input */
    T0 = pSrc16[i0 * 2U] >> 2U;
    T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;

    /* input is down scale by 4 to avoid overflow */
    /* Read xc (real), yc (imag) input */
    S0 = pSrc16[i2 * 2U] >> 2U;
    S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;

    /* R0 = (xa + xc) */
    R0 = __SSAT(T0 + S0, 16U);
    /* R1 = (ya + yc) */
    R1 = __SSAT(T1 + S1, 16U);

    /* S0 = (xa - xc) */
    S0 = __SSAT(T0 - S0, 16);
    /* S1 = (ya - yc) */
    S1 = __SSAT(T1 - S1, 16);

    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
    /* input is down scale by 4 to avoid overflow */
    /* Read xb (real), yb (imag) input */
    T0 = pSrc16[i1 * 2U] >> 2U;
    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;

    /* input is down scale by 4 to avoid overflow */
    /* Read xd (real), yd (imag) input */
    U0 = pSrc16[i3 * 2U] >> 2U;
    U1 = pSrc16[(i3 * 2U) + 1] >> 2U;

    /* T0 = (xb + xd) */
    T0 = __SSAT(T0 + U0, 16U);
    /* T1 = (yb + yd) */
    T1 = __SSAT(T1 + U1, 16U);

    /*  writing the butterfly processed i0 sample */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);

    /* R0 = (xa + xc) - (xb + xd) */
    /* R1 = (ya + yc) - (yb + yd) */
    R0 = __SSAT(R0 - T0, 16U);
    R1 = __SSAT(R1 - T1, 16U);

    /* co2 & si2 are read from Coefficient pointer */
    Co2 = pCoef16[2U * ic * 2U];
    Si2 = pCoef16[(2U * ic * 2U) + 1];

    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
    out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
    out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);

    /*  Reading i0+fftLen/4 again: T0/T1 were overwritten by the sums above
        and the slot is about to be overwritten with the xc', yc' output */
    /* input is down scale by 4 to avoid overflow */
    /* T0 = xb, T1 = yb */
    T0 = pSrc16[i1 * 2U] >> 2;
    T1 = pSrc16[(i1 * 2U) + 1] >> 2;

    /* writing the butterfly processed i0 + fftLen/4 sample */
    /* writing output(xc', yc') in little endian format */
    pSrc16[i1 * 2U] = out1;
    pSrc16[(i1 * 2U) + 1] = out2;

    /*  Butterfly calculations */
    /* input is down scale by 4 to avoid overflow */
    /* U0 = xd, U1 = yd */
    U0 = pSrc16[i3 * 2U] >> 2;
    U1 = pSrc16[(i3 * 2U) + 1] >> 2;
    /* T0 = xb-xd */
    T0 = __SSAT(T0 - U0, 16);
    /* T1 = yb-yd */
    T1 = __SSAT(T1 - U1, 16);

    /* R1 = (ya-yc) + (xb- xd),  R0 = (xa-xc) - (yb-yd)) */
    R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
    R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);

    /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
    S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16U);
    S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16U);

    /* co1 & si1 are read from Coefficient pointer */
    Co1 = pCoef16[ic * 2U];
    Si1 = pCoef16[(ic * 2U) + 1];
    /*  Butterfly process for the i0+fftLen/2 sample */
    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
    out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
    out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);

    /* writing output(xb', yb') in little endian format */
    pSrc16[i2 * 2U] = out1;
    pSrc16[(i2 * 2U) + 1] = out2;

    /* Co3 & si3 are read from Coefficient pointer */
    Co3 = pCoef16[3U * (ic * 2U)];
    Si3 = pCoef16[(3U * (ic * 2U)) + 1];
    /*  Butterfly process for the i0+3fftLen/4 sample */
    /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
    out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
    /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
    out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
    /* writing output(xd', yd') in little endian format */
    pSrc16[i3 * 2U] = out1;
    pSrc16[(i3 * 2U) + 1] = out2;

    /*  Twiddle coefficients index modifier */
    ic = ic + twidCoefModifier;

    /*  Updating input index */
    i0 = i0 + 1U;

  } while (--j);
  /* data is in 4.12(q12) format: input >> 2, then one more halving in the
     butterfly sum / the >> 16 of the twiddle product */

  /* end of first stage process */


  /* start of middle stage process */

  /*  Twiddle coefficients index modifier */
  twidCoefModifier <<= 2U;

  /*  Calculation of Middle stage */
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
  {
    /*  Initializations for the middle stage */
    n1 = n2;
    n2 >>= 2U;
    ic = 0U;

    for (j = 0U; j <= (n2 - 1U); j++)
    {
      /*  index calculation for the coefficients */
      Co1 = pCoef16[ic * 2U];
      Si1 = pCoef16[(ic * 2U) + 1U];
      Co2 = pCoef16[2U * (ic * 2U)];
      Si2 = pCoef16[(2U * (ic * 2U)) + 1U];
      Co3 = pCoef16[3U * (ic * 2U)];
      Si3 = pCoef16[(3U * (ic * 2U)) + 1U];

      /*  Twiddle coefficients index modifier */
      ic = ic + twidCoefModifier;

      /*  Butterfly implementation */
      for (i0 = j; i0 < fftLen; i0 += n1)
      {
        /*  index calculation for the input as, */
        /*  pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
        i1 = i0 + n2;
        i2 = i1 + n2;
        i3 = i2 + n2;

        /*  Reading i0, i0 + 2*n2 inputs */
        /* Read xa (real), ya (imag) input */
        T0 = pSrc16[i0 * 2U];
        T1 = pSrc16[(i0 * 2U) + 1U];

        /* Read xc (real), yc (imag) input */
        S0 = pSrc16[i2 * 2U];
        S1 = pSrc16[(i2 * 2U) + 1U];

        /* R0 = (xa + xc), R1 = (ya + yc) */
        R0 = __SSAT(T0 + S0, 16);
        R1 = __SSAT(T1 + S1, 16);

        /* S0 = (xa - xc), S1 = (ya - yc) */
        S0 = __SSAT(T0 - S0, 16);
        S1 = __SSAT(T1 - S1, 16);

        /*  Reading i0 + n2 , i0 + 3*n2 inputs */
        /* Read xb (real), yb (imag) input */
        T0 = pSrc16[i1 * 2U];
        T1 = pSrc16[(i1 * 2U) + 1U];

        /* Read xd (real), yd (imag) input */
        U0 = pSrc16[i3 * 2U];
        U1 = pSrc16[(i3 * 2U) + 1U];


        /* T0 = (xb + xd), T1 = (yb + yd) */
        T0 = __SSAT(T0 + U0, 16);
        T1 = __SSAT(T1 + U1, 16);

        /*  writing the butterfly processed i0 sample */

        /* xa' = xa + xb + xc + xd */
        /* ya' = ya + yb + yc + yd */
        out1 = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
        out2 = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;

        pSrc16[i0 * 2U] = out1;
        pSrc16[(2U * i0) + 1U] = out2;

        /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd) */
        R0 = (R0 >> 1U) - (T0 >> 1U);
        R1 = (R1 >> 1U) - (T1 >> 1U);

        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
        out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);

        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);

        /*  Reading i0 + n2 again: T0/T1 were overwritten by the sums above
            and the slot is about to be overwritten with the xc', yc' output */
        /* Read xb (real), yb (imag) input */
        T0 = pSrc16[i1 * 2U];
        T1 = pSrc16[(i1 * 2U) + 1U];

        /*  writing the butterfly processed i0 + n2 sample */
        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        pSrc16[i1 * 2U] = out1;
        pSrc16[(i1 * 2U) + 1U] = out2;

        /*  Butterfly calculations */

        /* Read xd (real), yd (imag) input */
        U0 = pSrc16[i3 * 2U];
        U1 = pSrc16[(i3 * 2U) + 1U];

        /* T0 = xb-xd, T1 = yb-yd */
        T0 = __SSAT(T0 - U0, 16);
        T1 = __SSAT(T1 - U1, 16);

        /* R0 = (xa-xc) - (yb-yd), R1 = (ya-yc) + (xb-xd) */
        R0 = (S0 >> 1U) - (T1 >> 1U);
        R1 = (S1 >> 1U) + (T0 >> 1U);

        /* S0 = (xa-xc) + (yb-yd), S1 = (ya-yc) - (xb-xd) */
        S0 = (S0 >> 1U) + (T1 >> 1U);
        S1 = (S1 >> 1U) - (T0 >> 1U);

        /*  Butterfly process for the i0 + 2*n2 sample */
        out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16U);

        out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16U);

        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
        pSrc16[i2 * 2U] = out1;
        pSrc16[(i2 * 2U) + 1U] = out2;

        /*  Butterfly process for the i0 + 3*n2 sample */
        out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);

        out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
        /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
        /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
        pSrc16[i3 * 2U] = out1;
        pSrc16[(i3 * 2U) + 1U] = out2;
      }
    }
    /*  Twiddle coefficients index modifier */
    twidCoefModifier <<= 2U;
  }
  /* end of middle stage process */


  /* data is in 10.6(q6) format for the 1024 point */
  /* data is in 8.8(q8) format for the 256 point */
  /* data is in 6.10(q10) format for the 64 point */
  /* data is in 4.12(q12) format for the 16 point */

  /*  Initializations for the last stage */
  n1 = n2;
  n2 >>= 2U;

  /* start of last stage process */

  /*  Butterfly implementation (no twiddle multiplication in this stage) */
  for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
  {
    /*  index calculation for the input as, */
    /*  pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
    i1 = i0 + n2;
    i2 = i1 + n2;
    i3 = i2 + n2;

    /*  Reading i0, i0 + 2*n2 inputs */
    /* Read xa (real), ya (imag) input */
    T0 = pSrc16[i0 * 2U];
    T1 = pSrc16[(i0 * 2U) + 1U];

    /* Read xc (real), yc (imag) input */
    S0 = pSrc16[i2 * 2U];
    S1 = pSrc16[(i2 * 2U) + 1U];

    /* R0 = (xa + xc), R1 = (ya + yc) */
    R0 = __SSAT(T0 + S0, 16U);
    R1 = __SSAT(T1 + S1, 16U);

    /* S0 = (xa - xc), S1 = (ya - yc) */
    S0 = __SSAT(T0 - S0, 16U);
    S1 = __SSAT(T1 - S1, 16U);

    /*  Reading i0 + n2 , i0 + 3*n2 inputs */
    /* Read xb (real), yb (imag) input */
    T0 = pSrc16[i1 * 2U];
    T1 = pSrc16[(i1 * 2U) + 1U];
    /* Read xd (real), yd (imag) input */
    U0 = pSrc16[i3 * 2U];
    U1 = pSrc16[(i3 * 2U) + 1U];

    /* T0 = (xb + xd), T1 = (yb + yd) */
    T0 = __SSAT(T0 + U0, 16U);
    T1 = __SSAT(T1 + U1, 16U);

    /*  writing the butterfly processed i0 sample */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);

    /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd) */
    R0 = (R0 >> 1U) - (T0 >> 1U);
    R1 = (R1 >> 1U) - (T1 >> 1U);
    /* Read xb (real), yb (imag) input again before its slot is overwritten */
    T0 = pSrc16[i1 * 2U];
    T1 = pSrc16[(i1 * 2U) + 1U];

    /*  writing the butterfly processed i0 + n2 sample */
    /* xc' = (xa-xb+xc-xd) */
    /* yc' = (ya-yb+yc-yd) */
    pSrc16[i1 * 2U] = R0;
    pSrc16[(i1 * 2U) + 1U] = R1;

    /* Read xd (real), yd (imag) input */
    U0 = pSrc16[i3 * 2U];
    U1 = pSrc16[(i3 * 2U) + 1U];
    /* T0 = (xb - xd), T1 = (yb - yd)  */
    T0 = __SSAT(T0 - U0, 16U);
    T1 = __SSAT(T1 - U1, 16U);

    /*  writing the butterfly processed i0 + 2*n2 sample */
    /* xb' = (xa+yb-xc-yd) */
    /* yb' = (ya-xb-yc+xd) */
    pSrc16[i2 * 2U] = (S0 >> 1U) + (T1 >> 1U);
    pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);

    /*  writing the butterfly processed i0 + 3*n2 sample */
    /* xd' = (xa-yb-xc+yd) */
    /* yd' = (ya+xb-yc-xd) */
    pSrc16[i3 * 2U] = (S0 >> 1U) - (T1 >> 1U);
    pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);

  }

  /* end of last stage process */

  /* output is in 11.5(q5) format for the 1024 point */
  /* output is in 9.7(q7) format for the 256 point   */
  /* output is in 7.9(q9) format for the 64 point  */
  /* output is in 5.11(q11) format for the 16 point  */

#endif /* #if defined (ARM_MATH_DSP) */

}
973 
974 
975 /**
976   @brief         Core function for the Q15 CIFFT butterfly process.
977   @param[in,out] pSrc16           points to the in-place buffer of Q15 data type
978   @param[in]     fftLen           length of the FFT
979   @param[in]     pCoef16          points to twiddle coefficient buffer
980   @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
981  */
982 
/*
 * Radix-4 IFFT algorithm used is :
 *
 * CIFFT uses same twiddle coefficients as CFFT function
 *  x[k] = x[n] + (j)^k * x[n + fftLen/4] + (-1)^k * x[n + fftLen/2] + (-j)^k * x[n + 3*fftLen/4]
 *
 *
 * IFFT is implemented with following changes in equations from FFT
 *
 * Input real and imaginary data:
 * x(n) = xa + j * ya
 * x(n+N/4 ) = xb + j * yb
 * x(n+N/2 ) = xc + j * yc
 * x(n+3N/4) = xd + j * yd
 *
 *
 * Output real and imaginary data:
 * x(4r) = xa'+ j * ya'
 * x(4r+1) = xb'+ j * yb'
 * x(4r+2) = xc'+ j * yc'
 * x(4r+3) = xd'+ j * yd'
 *
 *
 * Twiddle factors for radix-4 IFFT:
 * Wn = co1 + j * (si1)
 * W2n = co2 + j * (si2)
 * W3n = co3 + j * (si3)
 *
 * The real and imaginary output values for the radix-4 butterfly are
 * xa' = xa + xb + xc + xd
 * ya' = ya + yb + yc + yd
 * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
 * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
 * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
 * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
 * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
 * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
 *
 */
1022 
/*
 * Implementation notes for arm_radix4_butterfly_inverse_q15
 *
 * Notation used in the comments of this function:
 *   - Samples are interleaved in pSrc16: element 2*i is x (real part) and
 *     element 2*i + 1 is y (imaginary part) of complex sample i.
 *   - a, b, c, d are the four inputs of one butterfly, located at offsets
 *     0, n2, 2*n2 and 3*n2 (in complex samples) from the butterfly base.
 *   - Twiddles are interleaved in pCoef16 as (co, si) pairs; W1, W2, W3 are
 *     read at complex indices ic, 2*ic and 3*ic.
 *   - In the DSP path a q31_t holds one complex value as two packed q15
 *     halfwords.  Which halfword holds x depends on endianness (presumably
 *     x is the low halfword on little-endian targets -- confirm against
 *     read_q15x2), which is why several steps have an
 *     ARM_MATH_BIG_ENDIAN variant.  Both variants compute the same x/y.
 *
 * The transform is done in place in three phases: a first stage that
 * pre-scales every input by 1/4, the middle stages (skipped when
 * fftLen / 4 <= 4), and a last stage whose twiddles are all trivial.
 * In every stage the c' output is stored at offset n2 and the b' output at
 * offset 2*n2 (i.e. b' and c' are swapped with respect to the input order).
 *
 * Lengths below 4 are rejected: they would start the do/while counters at 0
 * and the `--j` test would wrap around.
 */
void arm_radix4_butterfly_inverse_q15(
        q15_t * pSrc16,
        uint32_t fftLen,
  const q15_t * pCoef16,
        uint32_t twidCoefModifier)
{

#if defined (ARM_MATH_DSP)

        q31_t R, S, T, U;
        q31_t C1, C2, C3, out1, out2;
        uint32_t n1, n2, ic, i0, j, k;

        q15_t *ptr1;
        q15_t *pSi0;
        q15_t *pSi1;
        q15_t *pSi2;
        q15_t *pSi3;

        q31_t xaya, xbyb, xcyc, xdyd;

  /* With fftLen < 4 the loop counters below would start at 0 and the
     `while (--j)` tests would wrap to ~2^32 iterations, running far past
     the end of the buffer. Nothing sensible can be computed, so bail out. */
  if (fftLen < 4U)
  {
    return;
  }

  /* Total process is divided into three stages */

  /* process first stage, middle stages, & last stage */

  /*  Initializations for the first stage */
  n2 = fftLen;
  n1 = n2;

  /* n2 = fftLen/4 : distance (in complex samples) between butterfly inputs */
  n2 >>= 2U;

  /* Index for twiddle coefficient */
  ic = 0U;

  /* Number of butterflies in the first stage */
  j = n2;

  /* One running pointer per butterfly input a, b, c, d */
  pSi0 = pSrc16;
  pSi1 = pSi0 + 2 * n2;
  pSi2 = pSi1 + 2 * n2;
  pSi3 = pSi2 + 2 * n2;

  /* Input is in 1.15(q15) format */

  /*  start of first stage process */
  do
  {
    /* Read a = (xa, ya) and scale it by 1/4: each __SHADD16(v, 0) is a
       halving add with zero, i.e. an arithmetic >> 1 on both halfwords */
    T = read_q15x2 (pSi0);
    T = __SHADD16(T, 0);
    T = __SHADD16(T, 0);

    /* Read c = (xc, yc), scaled by 1/4 */
    S = read_q15x2 (pSi2);
    S = __SHADD16(S, 0);
    S = __SHADD16(S, 0);

    /* R = a + c : x = (xa + xc), y = (ya + yc), saturating */
    R = __QADD16(T, S);

    /* S = a - c : x = (xa - xc), y = (ya - yc), saturating */
    S = __QSUB16(T, S);

    /* Read b = (xb, yb), scaled by 1/4 */
    T = read_q15x2 (pSi1);
    T = __SHADD16(T, 0);
    T = __SHADD16(T, 0);

    /* Read d = (xd, yd), scaled by 1/4 */
    U = read_q15x2 (pSi3);
    U = __SHADD16(U, 0);
    U = __SHADD16(U, 0);

    /* T = b + d : x = (xb + xd), y = (yb + yd), saturating */
    T = __QADD16(T, U);

    /* a' = ((a + c) + (b + d)) / 2, written back to the a slot */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    write_q15x2_ia (&pSi0, __SHADD16(R, T));

    /* R = (a + c) - (b + d) : x = (xa-xb+xc-xd), y = (ya-yb+yc-yd) */
    R = __QSUB16(R, T);

    /* (co2, si2) pair: complex twiddle index 2 * ic */
    C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));

    /* Complex multiply R * (co2 + j*si2); the >> 16 / upper-halfword
       selection below keeps the top 16 bits of each 32-bit product sum */
#ifndef ARM_MATH_BIG_ENDIAN
    /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
    out1 = __SMUSD(C2, R) >> 16U;
    /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
    out2 = __SMUADX(C2, R);
#else
    /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
    out1 = __SMUADX(C2, R) >> 16U;
    /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
    out2 = __SMUSD(__QSUB16(0, C2), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* Re-read b (T was overwritten by b + d) before its slot is reused */
    T = read_q15x2 (pSi1);
    T = __SHADD16(T, 0);
    T = __SHADD16(T, 0);

    /* c' is stored in the b slot (offset n2): low halfword of out1 packed
       with high halfword of out2 */
    write_q15x2_ia (&pSi1, (q31_t) __PKHBT( out1, out2, 0 ));

    /* Re-read d (its slot has not been written yet), scaled by 1/4 */
    U = read_q15x2 (pSi3);
    U = __SHADD16(U, 0);
    U = __SHADD16(U, 0);

    /* T = b - d : x = (xb - xd), y = (yb - yd) */
    T = __QSUB16(T, U);

    /* Cross terms (add/subtract with halfword exchange):
         R : x = (xa-xc) + (yb-yd),  y = (ya-yc) - (xb-xd)
         S : x = (xa-xc) - (yb-yd),  y = (ya-yc) + (xb-xd)
       R must be formed first because the second line consumes the old S */
#ifndef ARM_MATH_BIG_ENDIAN
    R = __QSAX(S, T);
    S = __QASX(S, T);
#else
    R = __QASX(S, T);
    S = __QSAX(S, T);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* (co1, si1) pair: complex twiddle index ic */
    C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));

    /* b' = S * (co1 + j*si1) */
#ifndef ARM_MATH_BIG_ENDIAN
    /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
    out1 = __SMUSD(C1, S) >> 16U;
    /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
    out2 = __SMUADX(C1, S);
#else
    /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
    out1 = __SMUADX(C1, S) >> 16U;
    /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
    out2 = __SMUSD(__QSUB16(0, C1), S);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* b' is stored in the c slot (offset 2 * n2) */
    write_q15x2_ia (&pSi2, __PKHBT( out1, out2, 0 ));

    /* (co3, si3) pair: complex twiddle index 3 * ic */
    C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));

    /* d' = R * (co3 + j*si3) */
#ifndef ARM_MATH_BIG_ENDIAN
    /* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
    out1 = __SMUSD(C3, R) >> 16U;
    /* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
    out2 = __SMUADX(C3, R);
#else
    /* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
    out1 = __SMUADX(C3, R) >> 16U;
    /* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
    out2 = __SMUSD(__QSUB16(0, C3), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* d' is stored in the d slot (offset 3 * n2) */
    write_q15x2_ia (&pSi3, __PKHBT( out1, out2, 0 ));

    /*  Twiddle coefficients index modifier */
    ic = ic + twidCoefModifier;

  } while (--j);
  /* data is in 4.11(q11) format */

  /* end of first stage process */


  /* start of middle stage process */

  /*  Twiddle coefficients index modifier */
  twidCoefModifier <<= 2U;

  /*  Calculation of Middle stage */
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
  {
    /*  Initializations for the middle stage:
        n1 = butterfly group stride, n2 = distance between butterfly inputs */
    n1 = n2;
    n2 >>= 2U;
    ic = 0U;

    for (j = 0U; j <= (n2 - 1U); j++)
    {
      /* The three twiddles are shared by every butterfly at position j */
      C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
      C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
      C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));

      /*  Twiddle coefficients index modifier */
      ic = ic + twidCoefModifier;

      /* Pointers to inputs a, b, c, d of the first butterfly at position j */
      pSi0 = pSrc16 + 2 * j;
      pSi1 = pSi0 + 2 * n2;
      pSi2 = pSi1 + 2 * n2;
      pSi3 = pSi2 + 2 * n2;

      /*  Butterfly implementation: one butterfly per group of n1 samples */
      for (i0 = j; i0 < fftLen; i0 += n1)
      {
        /* Read a = (xa, ya) */
        T = read_q15x2 (pSi0);

        /* Read c = (xc, yc) */
        S = read_q15x2 (pSi2);

        /* R = a + c : x = (xa + xc), y = (ya + yc) */
        R = __QADD16(T, S);

        /* S = a - c : x = (xa - xc), y = (ya - yc) */
        S = __QSUB16(T, S);

        /* Read b = (xb, yb) */
        T = read_q15x2 (pSi1);

        /* Read d = (xd, yd) */
        U = read_q15x2 (pSi3);

        /* T = b + d : x = (xb + xd), y = (yb + yd) */
        T = __QADD16(T, U);

        /* a' = ((a + c) + (b + d)) / 4 (two halving steps) */
        /* xa' = xa + xb + xc + xd */
        /* ya' = ya + yb + yc + yd */
        out1 = __SHADD16(R, T);
        out1 = __SHADD16(out1, 0);
        write_q15x2 (pSi0, out1);
        pSi0 += 2 * n1;

        /* R = ((a + c) - (b + d)) / 2 :
           x = (xa-xb+xc-xd), y = (ya-yb+yc-yd) */
        R = __SHSUB16(R, T);

#ifndef ARM_MATH_BIG_ENDIAN
        /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
        out1 = __SMUSD(C2, R) >> 16U;

        /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
        out2 = __SMUADX(C2, R);
#else
        /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
        out1 = __SMUADX(R, C2) >> 16U;

        /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
        out2 = __SMUSD(__QSUB16(0, C2), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* Re-read b (T was overwritten by b + d) before its slot is reused */
        T = read_q15x2 (pSi1);

        /* c' is stored in the b slot (offset n2) */
        write_q15x2 (pSi1, __PKHBT( out1, out2, 0 ));
        pSi1 += 2 * n1;

        /* Re-read d */
        U = read_q15x2 (pSi3);

        /* T = b - d : x = (xb - xd), y = (yb - yd) */
        T = __QSUB16(T, U);

        /* Halving cross terms:
             R : x = ((xa-xc) + (yb-yd)) / 2,  y = ((ya-yc) - (xb-xd)) / 2
             S : x = ((xa-xc) - (yb-yd)) / 2,  y = ((ya-yc) + (xb-xd)) / 2
           followed by b' = S * (co1 + j*si1) */
#ifndef ARM_MATH_BIG_ENDIAN
        R = __SHSAX(S, T);

        S = __SHASX(S, T);

        /* out1 -> xb', out2 -> yb' */
        out1 = __SMUSD(C1, S) >> 16U;
        out2 = __SMUADX(C1, S);
#else
        R = __SHASX(S, T);

        S = __SHSAX(S, T);

        /* out1 -> yb', out2 -> xb' */
        out1 = __SMUADX(S, C1) >> 16U;
        out2 = __SMUSD(__QSUB16(0, C1), S);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* b' is stored in the c slot (offset 2 * n2) */
        /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
        /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
        write_q15x2 (pSi2, __PKHBT( out1, out2, 0 ));
        pSi2 += 2 * n1;

        /* d' = R * (co3 + j*si3) */
#ifndef ARM_MATH_BIG_ENDIAN
        out1 = __SMUSD(C3, R) >> 16U;
        out2 = __SMUADX(C3, R);
#else
        out1 = __SMUADX(C3, R) >> 16U;
        out2 = __SMUSD(__QSUB16(0, C3), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* d' is stored in the d slot (offset 3 * n2) */
        /* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
        /* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
        write_q15x2 (pSi3, __PKHBT( out1, out2, 0 ));
        pSi3 += 2 * n1;
      }
    }
    /*  Twiddle coefficients index modifier */
    twidCoefModifier <<= 2U;
  }
  /* end of middle stage process */

  /* data is in 10.6(q6) format for the 1024 point */
  /* data is in 8.8(q8) format for the 256 point */
  /* data is in 6.10(q10) format for the 64 point */
  /* data is in 4.12(q12) format for the 16 point */

  /*  Initializations for the last stage: one butterfly per four
      consecutive complex samples */
  j = fftLen >> 2;

  ptr1 = &pSrc16[0];

  /* start of last stage process */

  /*  Butterfly implementation (no twiddle multiplication in this stage) */
  do
  {
    /* Read a = (xa, ya) */
    xaya = read_q15x2_ia (&ptr1);

    /* Read b = (xb, yb) */
    xbyb = read_q15x2_ia (&ptr1);

    /* Read c = (xc, yc) */
    xcyc = read_q15x2_ia (&ptr1);

    /* Read d = (xd, yd) */
    xdyd = read_q15x2_ia (&ptr1);

    /* R = a + c : x = (xa + xc), y = (ya + yc) */
    R = __QADD16(xaya, xcyc);

    /* T = b + d : x = (xb + xd), y = (yb + yd) */
    T = __QADD16(xbyb, xdyd);

    /* Rewind over the four complex samples (8 q15 values) just read so the
       results overwrite them */
    ptr1 = ptr1 - 8U;

    /* Slot 0: a' = ((a + c) + (b + d)) / 2 */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    write_q15x2_ia (&ptr1, __SHADD16(R, T));

    /* Slot 1: c' = ((a + c) - (b + d)) / 2 */
    /* xc' = (xa-xb+xc-xd) */
    /* yc' = (ya-yb+yc-yd) */
    write_q15x2_ia (&ptr1, __SHSUB16(R, T));

    /* S = a - c : x = (xa - xc), y = (ya - yc) */
    S = __QSUB16(xaya, xcyc);

    /* U = b - d : x = (xb - xd), y = (yb - yd) */
    U = __QSUB16(xbyb, xdyd);

#ifndef ARM_MATH_BIG_ENDIAN
    /* Slot 2: b' (halved) */
    /* xb' = (xa-yb-xc+yd) */
    /* yb' = (ya+xb-yc-xd) */
    write_q15x2_ia (&ptr1, __SHASX(S, U));

    /* Slot 3: d' (halved) */
    /* xd' = (xa+yb-xc-yd) */
    /* yd' = (ya-xb-yc+xd) */
    write_q15x2_ia (&ptr1, __SHSAX(S, U));
#else
    /* Slot 2: b' (halved) */
    /* xb' = (xa-yb-xc+yd) */
    /* yb' = (ya+xb-yc-xd) */
    write_q15x2_ia (&ptr1, __SHSAX(S, U));

    /* Slot 3: d' (halved) */
    /* xd' = (xa+yb-xc-yd) */
    /* yd' = (ya-xb-yc+xd) */
    write_q15x2_ia (&ptr1, __SHASX(S, U));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

  } while (--j);

  /* end of last stage  process */

  /* output is in 11.5(q5) format for the 1024 point */
  /* output is in 9.7(q7) format for the 256 point   */
  /* output is in 7.9(q9) format for the 64 point  */
  /* output is in 5.11(q11) format for the 16 point  */


#else /* arm_radix4_butterfly_inverse_q15 */

        q15_t R0, R1, S0, S1, T0, T1, U0, U1;
        q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
        uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;

  /* With fftLen < 4 the first-stage counter j would start at 0 and the
     `while (--j)` test would wrap to ~2^32 iterations, running far past
     the end of the buffer. Nothing sensible can be computed, so bail out. */
  if (fftLen < 4U)
  {
    return;
  }

  /* Total process is divided into three stages */

  /* process first stage, middle stages, & last stage */

  /*  Initializations for the first stage */
  n2 = fftLen;
  n1 = n2;

  /* n2 = fftLen/4 : distance (in complex samples) between butterfly inputs */
  n2 >>= 2U;

  /* Index for twiddle coefficient */
  ic = 0U;

  /* Index for input read and output write */
  i0 = 0U;

  /* Number of butterflies in the first stage */
  j = n2;

  /* Input is in 1.15(q15) format */

  /*  Start of first stage process */
  do
  {
    /*  Butterfly implementation */

    /*  index calculation for the input as, */
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
    i1 = i0 + n2;
    i2 = i1 + n2;
    i3 = i2 + n2;

    /* Every first-stage input is scaled down by 4 to avoid overflow */
    /* T0 = xa, T1 = ya */
    T0 = pSrc16[i0 * 2U] >> 2U;
    T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
    /* S0 = xc, S1 = yc */
    S0 = pSrc16[i2 * 2U] >> 2U;
    S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;

    /* R0 = (xa + xc), R1 = (ya + yc) */
    R0 = __SSAT(T0 + S0, 16U);
    R1 = __SSAT(T1 + S1, 16U);
    /* S0 = (xa - xc), S1 = (ya - yc) */
    S0 = __SSAT(T0 - S0, 16U);
    S1 = __SSAT(T1 - S1, 16U);

    /* T0 = xb, T1 = yb */
    T0 = pSrc16[i1 * 2U] >> 2U;
    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
    /* U0 = xd, U1 = yd */
    U0 = pSrc16[i3 * 2U] >> 2U;
    U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;

    /* T0 = (xb + xd), T1 = (yb + yd) */
    T0 = __SSAT(T0 + U0, 16U);
    T1 = __SSAT(T1 + U1, 16U);

    /*  writing the butterfly processed i0 sample (halved) */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);

    /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd) */
    R0 = __SSAT(R0 - T0, 16U);
    R1 = __SSAT(R1 - T1, 16U);
    /* co2 & si2 are read from Coefficient pointer (complex index 2 * ic) */
    Co2 = pCoef16[2U * ic * 2U];
    Si2 = pCoef16[(2U * ic * 2U) + 1U];
    /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
    out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16U);
    /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
    out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16U);

    /* Re-read b (T0/T1 were overwritten by b + d) before its slot is
       reused; input is down scaled by 4 */
    /* T0 = xb, T1 = yb */
    T0 = pSrc16[i1 * 2U] >> 2U;
    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;

    /* c' is stored in the b slot (index i1 = i0 + fftLen/4) */
    pSrc16[i1 * 2U] = out1;
    pSrc16[(i1 * 2U) + 1U] = out2;

    /* Re-read d; input is down scaled by 4 */
    /* U0 = xd, U1 = yd */
    U0 = pSrc16[i3 * 2U] >> 2U;
    U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;

    /* T0 = xb-xd, T1 = yb-yd */
    T0 = __SSAT(T0 - U0, 16U);
    T1 = __SSAT(T1 - U1, 16U);
    /* R0 = (xa-xc) + (yb-yd), R1 = (ya-yc) - (xb-xd) */
    R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
    R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
    /* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb-xd) */
    S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
    S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);

    /* co1 & si1 are read from Coefficient pointer (complex index ic) */
    Co1 = pCoef16[ic * 2U];
    Si1 = pCoef16[(ic * 2U) + 1U];
    /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
    out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
    /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
    out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
    /* b' is stored in the c slot (index i2 = i0 + fftLen/2) */
    pSrc16[i2 * 2U] = out1;
    pSrc16[(i2 * 2U) + 1U] = out2;

    /* co3 & si3 are read from Coefficient pointer (complex index 3 * ic) */
    Co3 = pCoef16[3U * ic * 2U];
    Si3 = pCoef16[(3U * ic * 2U) + 1U];
    /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
    out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
    /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
    out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
    /* d' is stored in the d slot (index i3 = i0 + 3fftLen/4) */
    pSrc16[i3 * 2U] = out1;
    pSrc16[(i3 * 2U) + 1U] = out2;

    /*  Twiddle coefficients index modifier */
    ic = ic + twidCoefModifier;

    /*  Updating input index */
    i0 = i0 + 1U;

  } while (--j);

  /*  End of first stage process */

  /* data is in 4.11(q11) format */


  /*  Start of Middle stage process */

  /*  Twiddle coefficients index modifier */
  twidCoefModifier <<= 2U;

  /*  Calculation of Middle stage */
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
  {
    /*  Initializations for the middle stage:
        n1 = butterfly group stride, n2 = distance between butterfly inputs */
    n1 = n2;
    n2 >>= 2U;
    ic = 0U;

    for (j = 0U; j <= (n2 - 1U); j++)
    {
      /* The three twiddles are shared by every butterfly at position j */
      Co1 = pCoef16[ic * 2U];
      Si1 = pCoef16[(ic * 2U) + 1U];
      Co2 = pCoef16[2U * ic * 2U];
      Si2 = pCoef16[2U * ic * 2U + 1U];
      Co3 = pCoef16[3U * ic * 2U];
      Si3 = pCoef16[(3U * ic * 2U) + 1U];

      /*  Twiddle coefficients index modifier */
      ic = ic + twidCoefModifier;

      /*  Butterfly implementation: one butterfly per group of n1 samples */
      for (i0 = j; i0 < fftLen; i0 += n1)
      {
        /*  index calculation for the inputs a, b, c, d: */
        /*  pSrc16[i0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
        i1 = i0 + n2;
        i2 = i1 + n2;
        i3 = i2 + n2;

        /* T0 = xa, T1 = ya */
        T0 = pSrc16[i0 * 2U];
        T1 = pSrc16[(i0 * 2U) + 1U];

        /* S0 = xc, S1 = yc */
        S0 = pSrc16[i2 * 2U];
        S1 = pSrc16[(i2 * 2U) + 1U];


        /* R0 = (xa + xc), R1 = (ya + yc) */
        R0 = __SSAT(T0 + S0, 16U);
        R1 = __SSAT(T1 + S1, 16U);
        /* S0 = (xa - xc), S1 = (ya - yc) */
        S0 = __SSAT(T0 - S0, 16U);
        S1 = __SSAT(T1 - S1, 16U);

        /* T0 = xb, T1 = yb */
        T0 = pSrc16[i1 * 2U];
        T1 = pSrc16[(i1 * 2U) + 1U];

        /* U0 = xd, U1 = yd */
        U0 = pSrc16[i3 * 2U];
        U1 = pSrc16[(i3 * 2U) + 1U];

        /* T0 = (xb + xd), T1 = (yb + yd) */
        T0 = __SSAT(T0 + U0, 16U);
        T1 = __SSAT(T1 + U1, 16U);

        /*  writing the butterfly processed i0 sample (scaled by 1/4) */
        /* xa' = xa + xb + xc + xd */
        /* ya' = ya + yb + yc + yd */
        pSrc16[i0 * 2U] = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
        pSrc16[(i0 * 2U) + 1U] = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;

        /* R0 = ((xa + xc) - (xb + xd)) / 2, R1 = ((ya + yc) - (yb + yd)) / 2 */
        R0 = (R0 >> 1U) - (T0 >> 1U);
        R1 = (R1 >> 1U) - (T1 >> 1U);

        /* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
        out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
        /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
        out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);

        /* Re-read b (T0/T1 were overwritten by b + d) before its slot is
           reused */
        /* T0 = xb, T1 = yb */
        T0 = pSrc16[i1 * 2U];
        T1 = pSrc16[(i1 * 2U) + 1U];

        /* c' is stored in the b slot (index i1 = i0 + n2) */
        /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
        /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
        pSrc16[i1 * 2U] = out1;
        pSrc16[(i1 * 2U) + 1U] = out2;

        /* Re-read d */
        /* U0 = xd, U1 = yd */
        U0 = pSrc16[i3 * 2U];
        U1 = pSrc16[(i3 * 2U) + 1U];

        /* T0 = xb-xd, T1 = yb-yd */
        T0 = __SSAT(T0 - U0, 16U);
        T1 = __SSAT(T1 - U1, 16U);

        /* R0 = ((xa-xc) + (yb-yd)) / 2, R1 = ((ya-yc) - (xb-xd)) / 2 */
        R0 = (S0 >> 1U) + (T1 >> 1U);
        R1 = (S1 >> 1U) - (T0 >> 1U);

        /* S0 = ((xa-xc) - (yb-yd)) / 2, S1 = ((ya-yc) + (xb-xd)) / 2 */
        S0 = (S0 >> 1U) - (T1 >> 1U);
        S1 = (S1 >> 1U) + (T0 >> 1U);

        /* b' = (S0 + j*S1) * (co1 + j*si1), stored in the c slot
           (index i2 = i0 + 2*n2) */
        out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
        out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
        /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
        /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
        pSrc16[i2 * 2U] = out1;
        pSrc16[(i2 * 2U) + 1U] = out2;

        /* d' = (R0 + j*R1) * (co3 + j*si3), stored in the d slot
           (index i3 = i0 + 3*n2) */
        out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);

        out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
        /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
        /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
        pSrc16[i3 * 2U] = out1;
        pSrc16[(i3 * 2U) + 1U] = out2;


      }
    }
    /*  Twiddle coefficients index modifier */
    twidCoefModifier <<= 2U;
  }
  /*  End of Middle stages process */


  /* data is in 10.6(q6) format for the 1024 point */
  /* data is in 8.8(q8) format for the 256 point   */
  /* data is in 6.10(q10) format for the 64 point  */
  /* data is in 4.12(q12) format for the 16 point  */

  /* start of last stage process */


  /*  Initializations for the last stage:
      n1 = butterfly group stride, n2 = distance between butterfly inputs */
  n1 = n2;
  n2 >>= 2U;

  /*  Butterfly implementation (no twiddle multiplication in this stage) */
  for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
  {
    /*  index calculation for the inputs a, b, c, d: */
    /*  pSrc16[i0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
    i1 = i0 + n2;
    i2 = i1 + n2;
    i3 = i2 + n2;

    /* T0 = xa, T1 = ya */
    T0 = pSrc16[i0 * 2U];
    T1 = pSrc16[(i0 * 2U) + 1U];
    /* S0 = xc, S1 = yc */
    S0 = pSrc16[i2 * 2U];
    S1 = pSrc16[(i2 * 2U) + 1U];

    /* R0 = (xa + xc), R1 = (ya + yc) */
    R0 = __SSAT(T0 + S0, 16U);
    R1 = __SSAT(T1 + S1, 16U);
    /* S0 = (xa - xc), S1 = (ya - yc) */
    S0 = __SSAT(T0 - S0, 16U);
    S1 = __SSAT(T1 - S1, 16U);

    /* T0 = xb, T1 = yb */
    T0 = pSrc16[i1 * 2U];
    T1 = pSrc16[(i1 * 2U) + 1U];
    /* U0 = xd, U1 = yd */
    U0 = pSrc16[i3 * 2U];
    U1 = pSrc16[(i3 * 2U) + 1U];

    /* T0 = (xb + xd), T1 = (yb + yd) */
    T0 = __SSAT(T0 + U0, 16U);
    T1 = __SSAT(T1 + U1, 16U);

    /*  writing the butterfly processed i0 sample (halved) */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);

    /* R0 = ((xa + xc) - (xb + xd)) / 2, R1 = ((ya + yc) - (yb + yd)) / 2 */
    R0 = (R0 >> 1U) - (T0 >> 1U);
    R1 = (R1 >> 1U) - (T1 >> 1U);

    /* Re-read b (T0/T1 were overwritten by b + d) before its slot is reused */
    /* T0 = xb, T1 = yb */
    T0 = pSrc16[i1 * 2U];
    T1 = pSrc16[(i1 * 2U) + 1U];

    /* c' is stored in the b slot (index i1) */
    /* xc' = (xa-xb+xc-xd) */
    /* yc' = (ya-yb+yc-yd) */
    pSrc16[i1 * 2U] = R0;
    pSrc16[(i1 * 2U) + 1U] = R1;

    /* Re-read d */
    /* U0 = xd, U1 = yd */
    U0 = pSrc16[i3 * 2U];
    U1 = pSrc16[(i3 * 2U) + 1U];
    /* T0 = (xb - xd), T1 = (yb - yd) */
    T0 = __SSAT(T0 - U0, 16U);
    T1 = __SSAT(T1 - U1, 16U);

    /* b' (halved) is stored in the c slot (index i2) */
    /* xb' = (xa-yb-xc+yd) */
    /* yb' = (ya+xb-yc-xd) */
    pSrc16[i2 * 2U] = (S0 >> 1U) - (T1 >> 1U);
    pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);


    /* d' (halved) is stored in the d slot (index i3) */
    /* xd' = (xa+yb-xc-yd) */
    /* yd' = (ya-xb-yc+xd) */
    pSrc16[i3 * 2U] = (S0 >> 1U) + (T1 >> 1U);
    pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
  }
  /* end of last stage  process */

  /* output is in 11.5(q5) format for the 1024 point */
  /* output is in 9.7(q7) format for the 256 point   */
  /* output is in 7.9(q9) format for the 64 point  */
  /* output is in 5.11(q11) format for the 16 point  */

#endif /* #if defined (ARM_MATH_DSP) */

}
1816