1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cfft_radix4_q15.c
4  * Description:  This file has function definition of Radix-4 FFT & IFFT function and
5  *               In-place bit reversal using bit reversal table
6  *
7  * $Date:        23 April 2021
8  * $Revision:    V1.9.0
9  *
10  * Target Processor: Cortex-M and Cortex-A cores
11  * -------------------------------------------------------------------- */
12 /*
13  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
14  *
15  * SPDX-License-Identifier: Apache-2.0
16  *
17  * Licensed under the Apache License, Version 2.0 (the License); you may
18  * not use this file except in compliance with the License.
19  * You may obtain a copy of the License at
20  *
21  * www.apache.org/licenses/LICENSE-2.0
22  *
23  * Unless required by applicable law or agreed to in writing, software
24  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
25  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26  * See the License for the specific language governing permissions and
27  * limitations under the License.
28  */
29 
30 #include "dsp/transform_functions.h"
31 
32 
33 void arm_radix4_butterfly_q15(
34         q15_t * pSrc16,
35         uint32_t fftLen,
36   const q15_t * pCoef16,
37         uint32_t twidCoefModifier);
38 
39 void arm_radix4_butterfly_inverse_q15(
40         q15_t * pSrc16,
41         uint32_t fftLen,
42   const q15_t * pCoef16,
43         uint32_t twidCoefModifier);
44 
45 void arm_bitreversal_q15(
46         q15_t * pSrc,
47         uint32_t fftLen,
48         uint16_t bitRevFactor,
49   const uint16_t * pBitRevTab);
50 
51 /**
52   @ingroup groupTransforms
53  */
54 
55 /**
56   @addtogroup ComplexFFT
57   @{
58  */
59 
60 
61 /**
62   @brief               Processing function for the Q15 CFFT/CIFFT.
63   @deprecated          Do not use this function.  It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
64   @param[in]     S     points to an instance of the Q15 CFFT/CIFFT structure.
65   @param[in,out] pSrc  points to the complex data buffer. Processing occurs in-place.
66   @return        none
67 
68   @par Input and output formats:
69                  Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
70                  Hence the output format is different for different FFT sizes.
71                  The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
72   @par
73                  \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"
74                  \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"
75  */
76 
arm_cfft_radix4_q15(const arm_cfft_radix4_instance_q15 * S,q15_t * pSrc)77 void arm_cfft_radix4_q15(
78   const arm_cfft_radix4_instance_q15 * S,
79         q15_t * pSrc)
80 {
81   if (S->ifftFlag == 1U)
82   {
83     /*  Complex IFFT radix-4  */
84     arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
85   }
86   else
87   {
88     /*  Complex FFT radix-4  */
89     arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
90   }
91 
92   if (S->bitReverseFlag == 1U)
93   {
94     /*  Bit Reversal */
95     arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
96   }
97 
98 }
99 
100 /**
101   @} end of ComplexFFT group
102  */
103 
104 /*
105  * Radix-4 FFT algorithm used is :
106  *
107  * Input real and imaginary data:
108  * x(n) = xa + j * ya
109  * x(n+N/4 ) = xb + j * yb
110  * x(n+N/2 ) = xc + j * yc
 * x(n+3N/4) = xd + j * yd
112  *
113  *
114  * Output real and imaginary data:
115  * x(4r) = xa'+ j * ya'
116  * x(4r+1) = xb'+ j * yb'
117  * x(4r+2) = xc'+ j * yc'
118  * x(4r+3) = xd'+ j * yd'
119  *
120  *
121  * Twiddle factors for radix-4 FFT:
122  * Wn = co1 + j * (- si1)
123  * W2n = co2 + j * (- si2)
124  * W3n = co3 + j * (- si3)
125 
126  * The real and imaginary output values for the radix-4 butterfly are
127  * xa' = xa + xb + xc + xd
128  * ya' = ya + yb + yc + yd
129  * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
130  * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
131  * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
132  * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
133  * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
134  * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
135  *
136  */
137 
138 /**
139   @brief         Core function for the Q15 CFFT butterfly process.
140   @param[in,out] pSrc16          points to the in-place buffer of Q15 data type
141   @param[in]     fftLen           length of the FFT
142   @param[in]     pCoef16         points to twiddle coefficient buffer
143   @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
144   @return        none
145  */
146 
arm_radix4_butterfly_q15(q15_t * pSrc16,uint32_t fftLen,const q15_t * pCoef16,uint32_t twidCoefModifier)147 void arm_radix4_butterfly_q15(
148         q15_t * pSrc16,
149         uint32_t fftLen,
150   const q15_t * pCoef16,
151         uint32_t twidCoefModifier)
152 {
153 
154 #if defined (ARM_MATH_DSP)
155 
156         q31_t R, S, T, U;
157         q31_t C1, C2, C3, out1, out2;
158         uint32_t n1, n2, ic, i0, j, k;
159 
160         q15_t *ptr1;
161         q15_t *pSi0;
162         q15_t *pSi1;
163         q15_t *pSi2;
164         q15_t *pSi3;
165 
166         q31_t xaya, xbyb, xcyc, xdyd;
167 
168   /* Total process is divided into three stages */
169 
170   /* process first stage, middle stages, & last stage */
171 
172   /*  Initializations for the first stage */
173   n2 = fftLen;
174   n1 = n2;
175 
176   /* n2 = fftLen/4 */
177   n2 >>= 2U;
178 
179   /* Index for twiddle coefficient */
180   ic = 0U;
181 
182   /* Index for input read and output write */
183   j = n2;
184 
185   pSi0 = pSrc16;
186   pSi1 = pSi0 + 2 * n2;
187   pSi2 = pSi1 + 2 * n2;
188   pSi3 = pSi2 + 2 * n2;
189 
190   /* Input is in 1.15(q15) format */
191 
192   /*  start of first stage process */
193   do
194   {
195     /*  Butterfly implementation */
196 
197     /* Reading i0, i0+fftLen/2 inputs */
198     /* Read ya (real), xa(imag) input */
199     T = read_q15x2 (pSi0);
200     T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
201     T = __SHADD16(T, 0); /* it turns out doing this twice is 2 cycles, the alternative takes 3 cycles */
202 /*
203     in = ((int16_t) (T & 0xFFFF)) >> 2;       // alternative code that takes 3 cycles
204      T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
205 */
206 
207     /* Read yc (real), xc(imag) input */
208     S = read_q15x2 (pSi2);
209     S = __SHADD16(S, 0);
210     S = __SHADD16(S, 0);
211 
212     /* R = packed((ya + yc), (xa + xc) ) */
213     R = __QADD16(T, S);
214 
215     /* S = packed((ya - yc), (xa - xc) ) */
216     S = __QSUB16(T, S);
217 
218     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
219     /* Read yb (real), xb(imag) input */
220     T = read_q15x2 (pSi1);
221     T = __SHADD16(T, 0);
222     T = __SHADD16(T, 0);
223 
224     /* Read yd (real), xd(imag) input */
225     U = read_q15x2 (pSi3);
226     U = __SHADD16(U, 0);
227     U = __SHADD16(U, 0);
228 
229     /* T = packed((yb + yd), (xb + xd) ) */
230     T = __QADD16(T, U);
231 
232     /*  writing the butterfly processed i0 sample */
233     /* xa' = xa + xb + xc + xd */
234     /* ya' = ya + yb + yc + yd */
235     write_q15x2_ia (&pSi0, __SHADD16(R, T));
236 
237     /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
238     R = __QSUB16(R, T);
239 
240     /* co2 & si2 are read from SIMD Coefficient pointer */
241     C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
242 
243 #ifndef ARM_MATH_BIG_ENDIAN
244     /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
245     out1 = __SMUAD(C2, R) >> 16U;
246     /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
247     out2 = __SMUSDX(C2, R);
248 #else
249     /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
250     out1 = __SMUSDX(R, C2) >> 16U;
251     /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
252     out2 = __SMUAD(C2, R);
253 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
254 
255     /*  Reading i0+fftLen/4 */
256     /* T = packed(yb, xb) */
257     T = read_q15x2 (pSi1);
258     T = __SHADD16(T, 0);
259     T = __SHADD16(T, 0);
260 
261     /* writing the butterfly processed i0 + fftLen/4 sample */
262     /* writing output(xc', yc') in little endian format */
263     write_q15x2_ia (&pSi1, (q31_t) __PKHBT( out1, out2, 0 ));
264 
265     /*  Butterfly calculations */
266     /* U = packed(yd, xd) */
267     U = read_q15x2 (pSi3);
268     U = __SHADD16(U, 0);
269     U = __SHADD16(U, 0);
270 
271     /* T = packed(yb-yd, xb-xd) */
272     T = __QSUB16(T, U);
273 
274 #ifndef ARM_MATH_BIG_ENDIAN
275     /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
276     R = __QASX(S, T);
277     /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
278     S = __QSAX(S, T);
279 #else
280     /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
281     R = __QSAX(S, T);
282     /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
283     S = __QASX(S, T);
284 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
285 
286     /* co1 & si1 are read from SIMD Coefficient pointer */
287     C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
288     /*  Butterfly process for the i0+fftLen/2 sample */
289 
290 #ifndef ARM_MATH_BIG_ENDIAN
291     /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
292     out1 = __SMUAD(C1, S) >> 16U;
293     /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
294     out2 = __SMUSDX(C1, S);
295 #else
296     /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
297     out1 = __SMUSDX(S, C1) >> 16U;
298     /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
299     out2 = __SMUAD(C1, S);
300 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
301 
302     /* writing output(xb', yb') in little endian format */
303     write_q15x2_ia (&pSi2, __PKHBT( out1, out2, 0 ));
304 
305     /* co3 & si3 are read from SIMD Coefficient pointer */
306     C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
307     /*  Butterfly process for the i0+3fftLen/4 sample */
308 
309 #ifndef ARM_MATH_BIG_ENDIAN
310     /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
311     out1 = __SMUAD(C3, R) >> 16U;
312     /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
313     out2 = __SMUSDX(C3, R);
314 #else
315     /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
316     out1 = __SMUSDX(R, C3) >> 16U;
317     /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
318     out2 = __SMUAD(C3, R);
319 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
320 
321     /* writing output(xd', yd') in little endian format */
322     write_q15x2_ia (&pSi3, __PKHBT( out1, out2, 0 ));
323 
324     /*  Twiddle coefficients index modifier */
325     ic = ic + twidCoefModifier;
326 
327   } while (--j);
328   /* data is in 4.11(q11) format */
329 
330   /* end of first stage process */
331 
332 
333   /* start of middle stage process */
334 
335   /*  Twiddle coefficients index modifier */
336   twidCoefModifier <<= 2U;
337 
338   /*  Calculation of Middle stage */
339   for (k = fftLen / 4U; k > 4U; k >>= 2U)
340   {
341     /*  Initializations for the middle stage */
342     n1 = n2;
343     n2 >>= 2U;
344     ic = 0U;
345 
346     for (j = 0U; j <= (n2 - 1U); j++)
347     {
348       /*  index calculation for the coefficients */
349       C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
350       C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
351       C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
352 
353       /*  Twiddle coefficients index modifier */
354       ic = ic + twidCoefModifier;
355 
356       pSi0 = pSrc16 + 2 * j;
357       pSi1 = pSi0 + 2 * n2;
358       pSi2 = pSi1 + 2 * n2;
359       pSi3 = pSi2 + 2 * n2;
360 
361       /*  Butterfly implementation */
362       for (i0 = j; i0 < fftLen; i0 += n1)
363       {
364         /*  Reading i0, i0+fftLen/2 inputs */
365         /* Read ya (real), xa(imag) input */
366         T = read_q15x2 (pSi0);
367 
368         /* Read yc (real), xc(imag) input */
369         S = read_q15x2 (pSi2);
370 
371         /* R = packed( (ya + yc), (xa + xc)) */
372         R = __QADD16(T, S);
373 
374         /* S = packed((ya - yc), (xa - xc)) */
375         S = __QSUB16(T, S);
376 
377         /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
378         /* Read yb (real), xb(imag) input */
379         T = read_q15x2 (pSi1);
380 
381         /* Read yd (real), xd(imag) input */
382         U = read_q15x2 (pSi3);
383 
384         /* T = packed( (yb + yd), (xb + xd)) */
385         T = __QADD16(T, U);
386 
387         /*  writing the butterfly processed i0 sample */
388 
389         /* xa' = xa + xb + xc + xd */
390         /* ya' = ya + yb + yc + yd */
391         out1 = __SHADD16(R, T);
392         out1 = __SHADD16(out1, 0);
393         write_q15x2 (pSi0, out1);
394         pSi0 += 2 * n1;
395 
396         /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
397         R = __SHSUB16(R, T);
398 
399 #ifndef ARM_MATH_BIG_ENDIAN
400         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
401         out1 = __SMUAD(C2, R) >> 16U;
402 
403         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
404         out2 = __SMUSDX(C2, R);
405 #else
406         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
407         out1 = __SMUSDX(R, C2) >> 16U;
408 
409         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
410         out2 = __SMUAD(C2, R);
411 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
412 
413         /*  Reading i0+3fftLen/4 */
414         /* Read yb (real), xb(imag) input */
415         T = read_q15x2 (pSi1);
416 
417         /*  writing the butterfly processed i0 + fftLen/4 sample */
418         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
419         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
420         write_q15x2 (pSi1, __PKHBT( out1, out2, 0 ));
421         pSi1 += 2 * n1;
422 
423         /*  Butterfly calculations */
424 
425         /* Read yd (real), xd(imag) input */
426         U = read_q15x2 (pSi3);
427 
428         /* T = packed(yb-yd, xb-xd) */
429         T = __QSUB16(T, U);
430 
431 #ifndef ARM_MATH_BIG_ENDIAN
432         /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
433         R = __SHASX(S, T);
434 
435         /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
436         S = __SHSAX(S, T);
437 
438 
439         /*  Butterfly process for the i0+fftLen/2 sample */
440         out1 = __SMUAD(C1, S) >> 16U;
441         out2 = __SMUSDX(C1, S);
442 #else
443         /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
444         R = __SHSAX(S, T);
445 
446         /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
447         S = __SHASX(S, T);
448 
449 
450         /*  Butterfly process for the i0+fftLen/2 sample */
451         out1 = __SMUSDX(S, C1) >> 16U;
452         out2 = __SMUAD(C1, S);
453 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
454 
455         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
456         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
457         write_q15x2 (pSi2, __PKHBT( out1, out2, 0 ));
458         pSi2 += 2 * n1;
459 
460         /*  Butterfly process for the i0+3fftLen/4 sample */
461 
462 #ifndef ARM_MATH_BIG_ENDIAN
463         out1 = __SMUAD(C3, R) >> 16U;
464         out2 = __SMUSDX(C3, R);
465 #else
466         out1 = __SMUSDX(R, C3) >> 16U;
467         out2 = __SMUAD(C3, R);
468 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
469 
470         /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
471         /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
472         write_q15x2 (pSi3, __PKHBT( out1, out2, 0 ));
473         pSi3 += 2 * n1;
474       }
475     }
476     /*  Twiddle coefficients index modifier */
477     twidCoefModifier <<= 2U;
478   }
479   /* end of middle stage process */
480 
481 
482   /* data is in 10.6(q6) format for the 1024 point */
483   /* data is in 8.8(q8) format for the 256 point */
484   /* data is in 6.10(q10) format for the 64 point */
485   /* data is in 4.12(q12) format for the 16 point */
486 
487   /*  Initializations for the last stage */
488   j = fftLen >> 2;
489 
490   ptr1 = &pSrc16[0];
491 
492   /* start of last stage process */
493 
494   /*  Butterfly implementation */
495   do
496   {
497     /* Read xa (real), ya(imag) input */
498     xaya = read_q15x2_ia ((q15_t **) &ptr1);
499 
500     /* Read xb (real), yb(imag) input */
501     xbyb = read_q15x2_ia ((q15_t **) &ptr1);
502 
503     /* Read xc (real), yc(imag) input */
504     xcyc = read_q15x2_ia ((q15_t **) &ptr1);
505 
506     /* Read xd (real), yd(imag) input */
507     xdyd = read_q15x2_ia ((q15_t **) &ptr1);
508 
509     /* R = packed((ya + yc), (xa + xc)) */
510     R = __QADD16(xaya, xcyc);
511 
512     /* T = packed((yb + yd), (xb + xd)) */
513     T = __QADD16(xbyb, xdyd);
514 
515     /* pointer updation for writing */
516     ptr1 = ptr1 - 8U;
517 
518 
519     /* xa' = xa + xb + xc + xd */
520     /* ya' = ya + yb + yc + yd */
521     write_q15x2_ia (&ptr1, __SHADD16(R, T));
522 
523     /* T = packed((yb + yd), (xb + xd)) */
524     T = __QADD16(xbyb, xdyd);
525 
526     /* xc' = (xa-xb+xc-xd) */
527     /* yc' = (ya-yb+yc-yd) */
528     write_q15x2_ia (&ptr1, __SHSUB16(R, T));
529 
530     /* S = packed((ya - yc), (xa - xc)) */
531     S = __QSUB16(xaya, xcyc);
532 
533     /* Read yd (real), xd(imag) input */
534     /* T = packed( (yb - yd), (xb - xd))  */
535     U = __QSUB16(xbyb, xdyd);
536 
537 #ifndef ARM_MATH_BIG_ENDIAN
538     /* xb' = (xa+yb-xc-yd) */
539     /* yb' = (ya-xb-yc+xd) */
540     write_q15x2_ia (&ptr1, __SHSAX(S, U));
541 
542     /* xd' = (xa-yb-xc+yd) */
543     /* yd' = (ya+xb-yc-xd) */
544     write_q15x2_ia (&ptr1, __SHASX(S, U));
545 #else
546     /* xb' = (xa+yb-xc-yd) */
547     /* yb' = (ya-xb-yc+xd) */
548     write_q15x2_ia (&ptr1, __SHASX(S, U));
549 
550     /* xd' = (xa-yb-xc+yd) */
551     /* yd' = (ya+xb-yc-xd) */
552     write_q15x2_ia (&ptr1, __SHSAX(S, U));
553 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
554 
555   } while (--j);
556 
557   /* end of last stage process */
558 
559   /* output is in 11.5(q5) format for the 1024 point */
560   /* output is in 9.7(q7) format for the 256 point   */
561   /* output is in 7.9(q9) format for the 64 point  */
562   /* output is in 5.11(q11) format for the 16 point  */
563 
564 
565 #else /* #if defined (ARM_MATH_DSP) */
566 
567         q15_t R0, R1, S0, S1, T0, T1, U0, U1;
568         q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
569         uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
570 
571   /* Total process is divided into three stages */
572 
573   /* process first stage, middle stages, & last stage */
574 
575   /*  Initializations for the first stage */
576   n2 = fftLen;
577   n1 = n2;
578 
579   /* n2 = fftLen/4 */
580   n2 >>= 2U;
581 
582   /* Index for twiddle coefficient */
583   ic = 0U;
584 
585   /* Index for input read and output write */
586   i0 = 0U;
587   j = n2;
588 
589   /* Input is in 1.15(q15) format */
590 
591   /*  start of first stage process */
592   do
593   {
594     /*  Butterfly implementation */
595 
596     /*  index calculation for the input as, */
597     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
598     i1 = i0 + n2;
599     i2 = i1 + n2;
600     i3 = i2 + n2;
601 
602     /*  Reading i0, i0+fftLen/2 inputs */
603 
604     /* input is down scale by 4 to avoid overflow */
605     /* Read ya (real), xa(imag) input */
606     T0 = pSrc16[i0 * 2U] >> 2U;
607     T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
608 
609     /* input is down scale by 4 to avoid overflow */
610     /* Read yc (real), xc(imag) input */
611     S0 = pSrc16[i2 * 2U] >> 2U;
612     S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
613 
614     /* R0 = (ya + yc) */
615     R0 = __SSAT(T0 + S0, 16U);
616     /* R1 = (xa + xc) */
617     R1 = __SSAT(T1 + S1, 16U);
618 
619     /* S0 = (ya - yc) */
620     S0 = __SSAT(T0 - S0, 16);
621     /* S1 = (xa - xc) */
622     S1 = __SSAT(T1 - S1, 16);
623 
624     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
625     /* input is down scale by 4 to avoid overflow */
626     /* Read yb (real), xb(imag) input */
627     T0 = pSrc16[i1 * 2U] >> 2U;
628     T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
629 
630     /* input is down scale by 4 to avoid overflow */
631     /* Read yd (real), xd(imag) input */
632     U0 = pSrc16[i3 * 2U] >> 2U;
633     U1 = pSrc16[(i3 * 2U) + 1] >> 2U;
634 
635     /* T0 = (yb + yd) */
636     T0 = __SSAT(T0 + U0, 16U);
637     /* T1 = (xb + xd) */
638     T1 = __SSAT(T1 + U1, 16U);
639 
640     /*  writing the butterfly processed i0 sample */
641     /* ya' = ya + yb + yc + yd */
642     /* xa' = xa + xb + xc + xd */
643     pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
644     pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
645 
646     /* R0 = (ya + yc) - (yb + yd) */
647     /* R1 = (xa + xc) - (xb + xd) */
648     R0 = __SSAT(R0 - T0, 16U);
649     R1 = __SSAT(R1 - T1, 16U);
650 
651     /* co2 & si2 are read from Coefficient pointer */
652     Co2 = pCoef16[2U * ic * 2U];
653     Si2 = pCoef16[(2U * ic * 2U) + 1];
654 
655     /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
656     out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
657     /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
658     out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
659 
660     /*  Reading i0+fftLen/4 */
661     /* input is down scale by 4 to avoid overflow */
662     /* T0 = yb, T1 =  xb */
663     T0 = pSrc16[i1 * 2U] >> 2;
664     T1 = pSrc16[(i1 * 2U) + 1] >> 2;
665 
666     /* writing the butterfly processed i0 + fftLen/4 sample */
667     /* writing output(xc', yc') in little endian format */
668     pSrc16[i1 * 2U] = out1;
669     pSrc16[(i1 * 2U) + 1] = out2;
670 
671     /*  Butterfly calculations */
672     /* input is down scale by 4 to avoid overflow */
673     /* U0 = yd, U1 = xd */
674     U0 = pSrc16[i3 * 2U] >> 2;
675     U1 = pSrc16[(i3 * 2U) + 1] >> 2;
676     /* T0 = yb-yd */
677     T0 = __SSAT(T0 - U0, 16);
678     /* T1 = xb-xd */
679     T1 = __SSAT(T1 - U1, 16);
680 
681     /* R1 = (ya-yc) + (xb- xd),  R0 = (xa-xc) - (yb-yd)) */
682     R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
683     R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
684 
685     /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
686     S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16U);
687     S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16U);
688 
689     /* co1 & si1 are read from Coefficient pointer */
690     Co1 = pCoef16[ic * 2U];
691     Si1 = pCoef16[(ic * 2U) + 1];
692     /*  Butterfly process for the i0+fftLen/2 sample */
693     /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
694     out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
695     /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
696     out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
697 
698     /* writing output(xb', yb') in little endian format */
699     pSrc16[i2 * 2U] = out1;
700     pSrc16[(i2 * 2U) + 1] = out2;
701 
702     /* Co3 & si3 are read from Coefficient pointer */
703     Co3 = pCoef16[3U * (ic * 2U)];
704     Si3 = pCoef16[(3U * (ic * 2U)) + 1];
705     /*  Butterfly process for the i0+3fftLen/4 sample */
706     /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
707     out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
708     /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
709     out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
710     /* writing output(xd', yd') in little endian format */
711     pSrc16[i3 * 2U] = out1;
712     pSrc16[(i3 * 2U) + 1] = out2;
713 
714     /*  Twiddle coefficients index modifier */
715     ic = ic + twidCoefModifier;
716 
717     /*  Updating input index */
718     i0 = i0 + 1U;
719 
720   } while (--j);
721   /* data is in 4.11(q11) format */
722 
723   /* end of first stage process */
724 
725 
726   /* start of middle stage process */
727 
728   /*  Twiddle coefficients index modifier */
729   twidCoefModifier <<= 2U;
730 
731   /*  Calculation of Middle stage */
732   for (k = fftLen / 4U; k > 4U; k >>= 2U)
733   {
734     /*  Initializations for the middle stage */
735     n1 = n2;
736     n2 >>= 2U;
737     ic = 0U;
738 
739     for (j = 0U; j <= (n2 - 1U); j++)
740     {
741       /*  index calculation for the coefficients */
742       Co1 = pCoef16[ic * 2U];
743       Si1 = pCoef16[(ic * 2U) + 1U];
744       Co2 = pCoef16[2U * (ic * 2U)];
745       Si2 = pCoef16[(2U * (ic * 2U)) + 1U];
746       Co3 = pCoef16[3U * (ic * 2U)];
747       Si3 = pCoef16[(3U * (ic * 2U)) + 1U];
748 
749       /*  Twiddle coefficients index modifier */
750       ic = ic + twidCoefModifier;
751 
752       /*  Butterfly implementation */
753       for (i0 = j; i0 < fftLen; i0 += n1)
754       {
755         /*  index calculation for the input as, */
756         /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
757         i1 = i0 + n2;
758         i2 = i1 + n2;
759         i3 = i2 + n2;
760 
761         /*  Reading i0, i0+fftLen/2 inputs */
762         /* Read ya (real), xa(imag) input */
763         T0 = pSrc16[i0 * 2U];
764         T1 = pSrc16[(i0 * 2U) + 1U];
765 
766         /* Read yc (real), xc(imag) input */
767         S0 = pSrc16[i2 * 2U];
768         S1 = pSrc16[(i2 * 2U) + 1U];
769 
770         /* R0 = (ya + yc), R1 = (xa + xc) */
771         R0 = __SSAT(T0 + S0, 16);
772         R1 = __SSAT(T1 + S1, 16);
773 
774         /* S0 = (ya - yc), S1 =(xa - xc) */
775         S0 = __SSAT(T0 - S0, 16);
776         S1 = __SSAT(T1 - S1, 16);
777 
778         /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
779         /* Read yb (real), xb(imag) input */
780         T0 = pSrc16[i1 * 2U];
781         T1 = pSrc16[(i1 * 2U) + 1U];
782 
783         /* Read yd (real), xd(imag) input */
784         U0 = pSrc16[i3 * 2U];
785         U1 = pSrc16[(i3 * 2U) + 1U];
786 
787 
788         /* T0 = (yb + yd), T1 = (xb + xd) */
789         T0 = __SSAT(T0 + U0, 16);
790         T1 = __SSAT(T1 + U1, 16);
791 
792         /*  writing the butterfly processed i0 sample */
793 
794         /* xa' = xa + xb + xc + xd */
795         /* ya' = ya + yb + yc + yd */
796         out1 = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
797         out2 = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
798 
799         pSrc16[i0 * 2U] = out1;
800         pSrc16[(2U * i0) + 1U] = out2;
801 
802         /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
803         R0 = (R0 >> 1U) - (T0 >> 1U);
804         R1 = (R1 >> 1U) - (T1 >> 1U);
805 
806         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
807         out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
808 
809         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
810         out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
811 
812         /*  Reading i0+3fftLen/4 */
813         /* Read yb (real), xb(imag) input */
814         T0 = pSrc16[i1 * 2U];
815         T1 = pSrc16[(i1 * 2U) + 1U];
816 
817         /*  writing the butterfly processed i0 + fftLen/4 sample */
818         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
819         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
820         pSrc16[i1 * 2U] = out1;
821         pSrc16[(i1 * 2U) + 1U] = out2;
822 
823         /*  Butterfly calculations */
824 
825         /* Read yd (real), xd(imag) input */
826         U0 = pSrc16[i3 * 2U];
827         U1 = pSrc16[(i3 * 2U) + 1U];
828 
829         /* T0 = yb-yd, T1 = xb-xd */
830         T0 = __SSAT(T0 - U0, 16);
831         T1 = __SSAT(T1 - U1, 16);
832 
833         /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
834         R0 = (S0 >> 1U) - (T1 >> 1U);
835         R1 = (S1 >> 1U) + (T0 >> 1U);
836 
837         /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
838         S0 = (S0 >> 1U) + (T1 >> 1U);
839         S1 = (S1 >> 1U) - (T0 >> 1U);
840 
841         /*  Butterfly process for the i0+fftLen/2 sample */
842         out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16U);
843 
844         out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16U);
845 
846         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
847         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
848         pSrc16[i2 * 2U] = out1;
849         pSrc16[(i2 * 2U) + 1U] = out2;
850 
851         /*  Butterfly process for the i0+3fftLen/4 sample */
852         out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
853 
854         out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
855         /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
856         /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
857         pSrc16[i3 * 2U] = out1;
858         pSrc16[(i3 * 2U) + 1U] = out2;
859       }
860     }
861     /*  Twiddle coefficients index modifier */
862     twidCoefModifier <<= 2U;
863   }
864   /* end of middle stage process */
865 
866 
867   /* data is in 10.6(q6) format for the 1024 point */
868   /* data is in 8.8(q8) format for the 256 point */
869   /* data is in 6.10(q10) format for the 64 point */
870   /* data is in 4.12(q12) format for the 16 point */
871 
872   /*  Initializations for the last stage */
873   n1 = n2;
874   n2 >>= 2U;
875 
876   /* start of last stage process */
877 
878   /*  Butterfly implementation */
879   for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
880   {
881     /*  index calculation for the input as, */
882     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
883     i1 = i0 + n2;
884     i2 = i1 + n2;
885     i3 = i2 + n2;
886 
887     /*  Reading i0, i0+fftLen/2 inputs */
888     /* Read ya (real), xa(imag) input */
889     T0 = pSrc16[i0 * 2U];
890     T1 = pSrc16[(i0 * 2U) + 1U];
891 
892     /* Read yc (real), xc(imag) input */
893     S0 = pSrc16[i2 * 2U];
894     S1 = pSrc16[(i2 * 2U) + 1U];
895 
896     /* R0 = (ya + yc), R1 = (xa + xc) */
897     R0 = __SSAT(T0 + S0, 16U);
898     R1 = __SSAT(T1 + S1, 16U);
899 
900     /* S0 = (ya - yc), S1 = (xa - xc) */
901     S0 = __SSAT(T0 - S0, 16U);
902     S1 = __SSAT(T1 - S1, 16U);
903 
904     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
905     /* Read yb (real), xb(imag) input */
906     T0 = pSrc16[i1 * 2U];
907     T1 = pSrc16[(i1 * 2U) + 1U];
908     /* Read yd (real), xd(imag) input */
909     U0 = pSrc16[i3 * 2U];
910     U1 = pSrc16[(i3 * 2U) + 1U];
911 
912     /* T0 = (yb + yd), T1 = (xb + xd)) */
913     T0 = __SSAT(T0 + U0, 16U);
914     T1 = __SSAT(T1 + U1, 16U);
915 
916     /*  writing the butterfly processed i0 sample */
917     /* xa' = xa + xb + xc + xd */
918     /* ya' = ya + yb + yc + yd */
919     pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
920     pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
921 
922     /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
923     R0 = (R0 >> 1U) - (T0 >> 1U);
924     R1 = (R1 >> 1U) - (T1 >> 1U);
925     /* Read yb (real), xb(imag) input */
926     T0 = pSrc16[i1 * 2U];
927     T1 = pSrc16[(i1 * 2U) + 1U];
928 
929     /*  writing the butterfly processed i0 + fftLen/4 sample */
930     /* xc' = (xa-xb+xc-xd) */
931     /* yc' = (ya-yb+yc-yd) */
932     pSrc16[i1 * 2U] = R0;
933     pSrc16[(i1 * 2U) + 1U] = R1;
934 
935     /* Read yd (real), xd(imag) input */
936     U0 = pSrc16[i3 * 2U];
937     U1 = pSrc16[(i3 * 2U) + 1U];
938     /* T0 = (yb - yd), T1 = (xb - xd)  */
939     T0 = __SSAT(T0 - U0, 16U);
940     T1 = __SSAT(T1 - U1, 16U);
941 
942     /*  writing the butterfly processed i0 + fftLen/2 sample */
943     /* xb' = (xa+yb-xc-yd) */
944     /* yb' = (ya-xb-yc+xd) */
945     pSrc16[i2 * 2U] = (S0 >> 1U) + (T1 >> 1U);
946     pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
947 
948     /*  writing the butterfly processed i0 + 3fftLen/4 sample */
949     /* xd' = (xa-yb-xc+yd) */
950     /* yd' = (ya+xb-yc-xd) */
951     pSrc16[i3 * 2U] = (S0 >> 1U) - (T1 >> 1U);
952     pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
953 
954   }
955 
956   /* end of last stage process */
957 
958   /* output is in 11.5(q5) format for the 1024 point */
959   /* output is in 9.7(q7) format for the 256 point   */
960   /* output is in 7.9(q9) format for the 64 point  */
961   /* output is in 5.11(q11) format for the 16 point  */
962 
963 #endif /* #if defined (ARM_MATH_DSP) */
964 
965 }
966 
967 
968 /**
969   @brief         Core function for the Q15 CIFFT butterfly process.
970   @param[in,out] pSrc16           points to the in-place buffer of Q15 data type
971   @param[in]     fftLen           length of the FFT
972   @param[in]     pCoef16          points to twiddle coefficient buffer
973   @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
974   @return        none
975  */
976 
977 /*
978  * Radix-4 IFFT algorithm used is :
979  *
980  * CIFFT uses same twiddle coefficients as CFFT function
 *  x[k] = x[n] + (j)^k * x[n + fftLen/4] + (-1)^k * x[n+fftLen/2] + (-j)^k * x[n+3*fftLen/4]
982  *
983  *
984  * IFFT is implemented with following changes in equations from FFT
985  *
986  * Input real and imaginary data:
987  * x(n) = xa + j * ya
988  * x(n+N/4 ) = xb + j * yb
989  * x(n+N/2 ) = xc + j * yc
 * x(n+3N/4) = xd + j * yd
991  *
992  *
993  * Output real and imaginary data:
994  * x(4r) = xa'+ j * ya'
995  * x(4r+1) = xb'+ j * yb'
996  * x(4r+2) = xc'+ j * yc'
997  * x(4r+3) = xd'+ j * yd'
998  *
999  *
1000  * Twiddle factors for radix-4 IFFT:
1001  * Wn = co1 + j * (si1)
1002  * W2n = co2 + j * (si2)
1003  * W3n = co3 + j * (si3)
1004 
1005  * The real and imaginary output values for the radix-4 butterfly are
1006  * xa' = xa + xb + xc + xd
1007  * ya' = ya + yb + yc + yd
1008  * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
1009  * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
1010  * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
1011  * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
1012  * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
1013  * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
1014  *
1015  */
1016 
void arm_radix4_butterfly_inverse_q15(
        q15_t * pSrc16,
        uint32_t fftLen,
  const q15_t * pCoef16,
        uint32_t twidCoefModifier)
{
  /*
   * In-place radix-4 inverse butterfly over fftLen complex Q15 samples.
   *
   * pSrc16 is indexed as interleaved pairs: element 2*i is treated as the
   * real part (x) and element 2*i+1 as the imaginary part (y) of sample i.
   * pCoef16 is indexed the same way as (co, si) pairs; the twiddles for a
   * butterfly are taken at complex indices ic, 2*ic and 3*ic, and ic is
   * advanced by twidCoefModifier per butterfly.
   *
   * The work is split into a first stage (inputs scaled by 1/4), zero or
   * more middle stages, and a last stage that uses no twiddle multiplies.
   * Two implementations follow: one built on packed 2x16-bit intrinsics
   * (ARM_MATH_DSP) and a scalar one.
   *
   * fftLen is not validated here; the do/while loops assume that
   * (fftLen >> 2) is non-zero.
   *
   * Notation in the comments below: a, b, c, d are the four butterfly inputs
   * in memory order; {p, q} denotes a (real, imag) pair.
   */

#if defined (ARM_MATH_DSP)

        q31_t R, S, T, U;
        q31_t C1, C2, C3, out1, out2;
        uint32_t n1, n2, ic, i0, j, k;

        q15_t *ptr1;
        q15_t *pSi0;
        q15_t *pSi1;
        q15_t *pSi2;
        q15_t *pSi3;

        q31_t xaya, xbyb, xcyc, xdyd;

  /* Total process is divided into three stages */

  /* process first stage, middle stages, & last stage */

  /*  Initializations for the first stage */
  n2 = fftLen;
  n1 = n2;

  /* n2 = fftLen/4 */
  n2 >>= 2U;

  /* Index for twiddle coefficient */
  ic = 0U;

  /* Number of first-stage butterflies (loop down-counter) */
  j = n2;

  /* Four read/write cursors, each 2 * n2 q15 elements (n2 complex samples) apart */
  pSi0 = pSrc16;
  pSi1 = pSi0 + 2 * n2;
  pSi2 = pSi1 + 2 * n2;
  pSi3 = pSi2 + 2 * n2;

  /* Input is in 1.15(q15) format */

  /*  start of first stage process */
  do
  {
    /*  Butterfly implementation */

    /*  Reading the first and third inputs (pSi0, pSi2) */
    /* Read {xa, ya}; the two halving adds with zero scale both lanes by 1/4,
       the counterpart of the ">> 2U" input scaling in the non-DSP path */
    T = read_q15x2 (pSi0);
    T = __SHADD16(T, 0);
    T = __SHADD16(T, 0);

    /* Read {xc, yc}, scaled by 1/4 */
    S = read_q15x2 (pSi2);
    S = __SHADD16(S, 0);
    S = __SHADD16(S, 0);

    /* R = {xa + xc, ya + yc} */
    R = __QADD16(T, S);

    /* S = {xa - xc, ya - yc} */
    S = __QSUB16(T, S);

    /*  Reading the second and fourth inputs (pSi1, pSi3) */
    /* Read {xb, yb}, scaled by 1/4 */
    T = read_q15x2 (pSi1);
    T = __SHADD16(T, 0);
    T = __SHADD16(T, 0);

    /* Read {xd, yd}, scaled by 1/4 */
    U = read_q15x2 (pSi3);
    U = __SHADD16(U, 0);
    U = __SHADD16(U, 0);

    /* T = {xb + xd, yb + yd} */
    T = __QADD16(T, U);

    /*  writing the butterfly processed first sample */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    /* (halving add; nothing else in this loop advances pSi0, so the loop
       relies on write_q15x2_ia stepping the cursor to the next sample) */
    write_q15x2_ia (&pSi0, __SHADD16(R, T));

    /* R = {(xa + xc) - (xb + xd), (ya + yc) - (yb + yd)} */
    R = __QSUB16(R, T);

    /* co2 & si2 are read as one packed pair at complex index 2 * ic */
    C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));

#ifndef ARM_MATH_BIG_ENDIAN
    /* out1 -> xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
    out1 = __SMUSD(C2, R) >> 16U;
    /* out2 -> yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2), kept in the upper halfword */
    out2 = __SMUADX(C2, R);
#else
    /* Big-endian packing swaps the lanes, so out1 carries the imaginary result: */
    /* out1 -> yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
    out1 = __SMUADX(C2, R) >> 16U;
    /* out2 -> xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2), kept in the upper halfword */
    out2 = __SMUSD(__QSUB16(0, C2), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /*  Re-reading the second input: T was overwritten by the sum above, and
        pSi1 is about to be overwritten with the result */
    /* T = {xb, yb}, scaled by 1/4 */
    T = read_q15x2 (pSi1);
    T = __SHADD16(T, 0);
    T = __SHADD16(T, 0);

    /* writing the butterfly processed second sample */
    /* pack the low halfword of out1 with the high halfword of out2 and store {xc', yc'} */
    write_q15x2_ia (&pSi1, (q31_t) __PKHBT( out1, out2, 0 ));

    /*  Butterfly calculations */
    /* U = {xd, yd}, scaled by 1/4 (pSi3 has not been written yet) */
    U = read_q15x2 (pSi3);
    U = __SHADD16(U, 0);
    U = __SHADD16(U, 0);

    /* T = {xb - xd, yb - yd} */
    T = __QSUB16(T, U);

    /* The exchange add/sub intrinsics are swapped between the two endian
       variants so that R and S hold the same logical values in both. */
#ifndef ARM_MATH_BIG_ENDIAN
    /* R = {(xa-xc) + (yb-yd), (ya-yc) - (xb-xd)} */
    R = __QSAX(S, T);
    /* S = {(xa-xc) - (yb-yd), (ya-yc) + (xb-xd)} */
    S = __QASX(S, T);
#else
    /* R = {(xa-xc) + (yb-yd), (ya-yc) - (xb-xd)} */
    R = __QASX(S, T);
    /* S = {(xa-xc) - (yb-yd), (ya-yc) + (xb-xd)} */
    S = __QSAX(S, T);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* co1 & si1 are read as one packed pair at complex index ic */
    C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
    /*  Butterfly process for the third sample (pSi2) */

#ifndef ARM_MATH_BIG_ENDIAN
    /* out1 -> xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
    out1 = __SMUSD(C1, S) >> 16U;
    /* out2 -> yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
    out2 = __SMUADX(C1, S);
#else
    /* out1 -> yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
    out1 = __SMUADX(C1, S) >> 16U;
    /* out2 -> xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
    out2 = __SMUSD(__QSUB16(0, C1), S);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* writing output {xb', yb'} */
    write_q15x2_ia (&pSi2, __PKHBT( out1, out2, 0 ));

    /* co3 & si3 are read as one packed pair at complex index 3 * ic */
    C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
    /*  Butterfly process for the fourth sample (pSi3) */

#ifndef ARM_MATH_BIG_ENDIAN
    /* out1 -> xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
    out1 = __SMUSD(C3, R) >> 16U;
    /* out2 -> yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
    out2 = __SMUADX(C3, R);
#else
    /* out1 -> yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
    out1 = __SMUADX(C3, R) >> 16U;
    /* out2 -> xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
    out2 = __SMUSD(__QSUB16(0, C3), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* writing output {xd', yd'} */
    write_q15x2_ia (&pSi3, __PKHBT( out1, out2, 0 ));

    /*  Twiddle coefficients index modifier */
    ic = ic + twidCoefModifier;

  } while (--j);
  /* data is in 4.11(q11) format */

  /* end of first stage process */


  /* start of middle stage process */

  /*  Each later stage steps through the twiddle table four times faster */
  twidCoefModifier <<= 2U;

  /*  Calculation of Middle stage */
  /*  k only counts stages: the body runs once for every factor of 4 by which
      fftLen / 4 exceeds 4, so it is skipped entirely when fftLen / 4 <= 4 */
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
  {
    /*  Initializations for the middle stage */
    /*  n1 = butterfly group span, n2 = distance between the four inputs */
    n1 = n2;
    n2 >>= 2U;
    ic = 0U;

    for (j = 0U; j <= (n2 - 1U); j++)
    {
      /*  The three twiddles are shared by every group at this offset j */
      C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
      C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
      C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));

      /*  Twiddle coefficients index modifier */
      ic = ic + twidCoefModifier;

      /*  Cursors for the four inputs of the first group at offset j */
      pSi0 = pSrc16 + 2 * j;
      pSi1 = pSi0 + 2 * n2;
      pSi2 = pSi1 + 2 * n2;
      pSi3 = pSi2 + 2 * n2;

      /*  Butterfly implementation: one butterfly per group of n1 samples */
      for (i0 = j; i0 < fftLen; i0 += n1)
      {
        /*  Reading the first and third inputs (pSi0, pSi2) */
        /* Read {xa, ya} */
        T = read_q15x2 (pSi0);

        /* Read {xc, yc} */
        S = read_q15x2 (pSi2);

        /* R = {xa + xc, ya + yc} */
        R = __QADD16(T, S);

        /* S = {xa - xc, ya - yc} */
        S = __QSUB16(T, S);

        /*  Reading the second and fourth inputs (pSi1, pSi3) */
        /* Read {xb, yb} */
        T = read_q15x2 (pSi1);

        /* Read {xd, yd} */
        U = read_q15x2 (pSi3);

        /* T = {xb + xd, yb + yd} */
        T = __QADD16(T, U);

        /*  writing the butterfly processed first sample */

        /* xa' = xa + xb + xc + xd */
        /* ya' = ya + yb + yc + yd */
        /* (two halving adds: the stored value is the sum scaled by 1/4) */
        out1 = __SHADD16(R, T);
        out1 = __SHADD16(out1, 0);
        write_q15x2 (pSi0, out1);
        pSi0 += 2 * n1;

        /* R = {(xa + xc) - (xb + xd), (ya + yc) - (yb + yd)}, halved */
        R = __SHSUB16(R, T);

#ifndef ARM_MATH_BIG_ENDIAN
        /* out1 -> (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
        out1 = __SMUSD(C2, R) >> 16U;

        /* out2 -> (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
        out2 = __SMUADX(C2, R);
#else
        /* out1 -> (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
        out1 = __SMUADX(R, C2) >> 16U;

        /* out2 -> (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
        out2 = __SMUSD(__QSUB16(0, C2), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /*  Re-reading the second input before pSi1 is overwritten */
        /* Read {xb, yb} */
        T = read_q15x2 (pSi1);

        /*  writing the butterfly processed second sample */
        /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
        /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
        write_q15x2 (pSi1, __PKHBT( out1, out2, 0 ));
        pSi1 += 2 * n1;

        /*  Butterfly calculations */

        /* Read {xd, yd} */
        U = read_q15x2 (pSi3);

        /* T = {xb - xd, yb - yd} */
        T = __QSUB16(T, U);

#ifndef ARM_MATH_BIG_ENDIAN
        /* R = {(xa-xc) + (yb-yd), (ya-yc) - (xb-xd)}, halved */
        R = __SHSAX(S, T);

        /* S = {(xa-xc) - (yb-yd), (ya-yc) + (xb-xd)}, halved */
        S = __SHASX(S, T);

        /*  Butterfly process for the third sample (pSi2) */
        out1 = __SMUSD(C1, S) >> 16U;
        out2 = __SMUADX(C1, S);
#else
        /* R = {(xa-xc) + (yb-yd), (ya-yc) - (xb-xd)}, halved */
        R = __SHASX(S, T);

        /* S = {(xa-xc) - (yb-yd), (ya-yc) + (xb-xd)}, halved */
        S = __SHSAX(S, T);

        /*  Butterfly process for the third sample (pSi2) */
        out1 = __SMUADX(S, C1) >> 16U;
        out2 = __SMUSD(__QSUB16(0, C1), S);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
        /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
        write_q15x2 (pSi2, __PKHBT( out1, out2, 0 ));
        pSi2 += 2 * n1;

        /*  Butterfly process for the fourth sample (pSi3) */

#ifndef ARM_MATH_BIG_ENDIAN
        out1 = __SMUSD(C3, R) >> 16U;
        out2 = __SMUADX(C3, R);
#else
        out1 = __SMUADX(C3, R) >> 16U;
        out2 = __SMUSD(__QSUB16(0, C3), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
        /* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
        write_q15x2 (pSi3, __PKHBT( out1, out2, 0 ));
        pSi3 += 2 * n1;
      }
    }
    /*  Twiddle coefficients index modifier */
    twidCoefModifier <<= 2U;
  }
  /* end of middle stage process */

  /* data is in 10.6(q6) format for the 1024 point */
  /* data is in 8.8(q8) format for the 256 point */
  /* data is in 6.10(q10) format for the 64 point */
  /* data is in 4.12(q12) format for the 16 point */

  /*  Initializations for the last stage */
  /*  One butterfly per four consecutive complex samples */
  j = fftLen >> 2;

  ptr1 = &pSrc16[0];

  /* start of last stage process */

  /*  Butterfly implementation (no twiddle multiplications in this stage) */
  do
  {
    /* Read {xa, ya} */
    xaya = read_q15x2_ia ((q15_t **) &ptr1);

    /* Read {xb, yb} */
    xbyb = read_q15x2_ia ((q15_t **) &ptr1);

    /* Read {xc, yc} */
    xcyc = read_q15x2_ia ((q15_t **) &ptr1);

    /* Read {xd, yd} */
    xdyd = read_q15x2_ia ((q15_t **) &ptr1);

    /* R = {xa + xc, ya + yc} */
    R = __QADD16(xaya, xcyc);

    /* T = {xb + xd, yb + yd} */
    T = __QADD16(xbyb, xdyd);

    /* rewind by 8 q15 elements (the four samples just read) so the results
       overwrite them in place */
    ptr1 = ptr1 - 8U;


    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    write_q15x2_ia (&ptr1, __SHADD16(R, T));

    /* T = {xb + xd, yb + yd} (same value as computed above) */
    T = __QADD16(xbyb, xdyd);

    /* xc' = (xa-xb+xc-xd) */
    /* yc' = (ya-yb+yc-yd) */
    write_q15x2_ia (&ptr1, __SHSUB16(R, T));

    /* S = {xa - xc, ya - yc} */
    S = __QSUB16(xaya, xcyc);

    /* U = {xb - xd, yb - yd} */
    U = __QSUB16(xbyb, xdyd);

#ifndef ARM_MATH_BIG_ENDIAN
    /* xb' = (xa-yb-xc+yd) */
    /* yb' = (ya+xb-yc-xd) */
    write_q15x2_ia (&ptr1, __SHASX(S, U));

    /* xd' = (xa+yb-xc-yd) */
    /* yd' = (ya-xb-yc+xd) */
    write_q15x2_ia (&ptr1, __SHSAX(S, U));
#else
    /* xb' = (xa-yb-xc+yd) */
    /* yb' = (ya+xb-yc-xd) */
    write_q15x2_ia (&ptr1, __SHSAX(S, U));

    /* xd' = (xa+yb-xc-yd) */
    /* yd' = (ya-xb-yc+xd) */
    write_q15x2_ia (&ptr1, __SHASX(S, U));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

  } while (--j);

  /* end of last stage  process */

  /* output is in 11.5(q5) format for the 1024 point */
  /* output is in 9.7(q7) format for the 256 point   */
  /* output is in 7.9(q9) format for the 64 point  */
  /* output is in 5.11(q11) format for the 16 point  */


#else /* arm_radix4_butterfly_inverse_q15 */

        q15_t R0, R1, S0, S1, T0, T1, U0, U1;
        q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
        uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;

  /* Total process is divided into three stages */

  /* process first stage, middle stages, & last stage */

  /*  Initializations for the first stage */
  n2 = fftLen;
  n1 = n2;

  /* n2 = fftLen/4 */
  n2 >>= 2U;

  /* Index for twiddle coefficient */
  ic = 0U;

  /* Index for input read and output write */
  i0 = 0U;

  /* Number of first-stage butterflies (loop down-counter) */
  j = n2;

  /* Input is in 1.15(q15) format */

  /*  Start of first stage process */
  do
  {
    /*  Butterfly implementation */

    /*  index calculation for the input as, */
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
    i1 = i0 + n2;
    i2 = i1 + n2;
    i3 = i2 + n2;

    /*  Reading i0, i0+fftLen/2 inputs */
    /* input is down scale by 4 to avoid overflow */
    /* Read xa (real), ya (imag) input */
    T0 = pSrc16[i0 * 2U] >> 2U;
    T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
    /* input is down scale by 4 to avoid overflow */
    /* Read xc (real), yc (imag) input */
    S0 = pSrc16[i2 * 2U] >> 2U;
    S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;

    /* R0 = (xa + xc), R1 = (ya + yc) */
    R0 = __SSAT(T0 + S0, 16U);
    R1 = __SSAT(T1 + S1, 16U);
    /* S0 = (xa - xc), S1 = (ya - yc) */
    S0 = __SSAT(T0 - S0, 16U);
    S1 = __SSAT(T1 - S1, 16U);

    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
    /* input is down scale by 4 to avoid overflow */
    /* Read xb (real), yb (imag) input */
    T0 = pSrc16[i1 * 2U] >> 2U;
    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
    /* Read xd (real), yd (imag) input */
    /* input is down scale by 4 to avoid overflow */
    U0 = pSrc16[i3 * 2U] >> 2U;
    U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;

    /* T0 = (xb + xd), T1 = (yb + yd) */
    T0 = __SSAT(T0 + U0, 16U);
    T1 = __SSAT(T1 + U1, 16U);

    /*  writing the butterfly processed i0 sample */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    /* (each operand is halved before the add) */
    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);

    /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd) */
    R0 = __SSAT(R0 - T0, 16U);
    R1 = __SSAT(R1 - T1, 16U);
    /* co2 & si2 are read from Coefficient pointer at complex index 2 * ic */
    Co2 = pCoef16[2U * ic * 2U];
    Si2 = pCoef16[(2U * ic * 2U) + 1U];
    /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
    out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16U);
    /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
    out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16U);

    /*  Re-reading i0+fftLen/4: T0/T1 were overwritten by the sums above and
        the slot is about to be overwritten with the result */
    /* input is down scale by 4 to avoid overflow */
    /* T0 = xb, T1 = yb */
    T0 = pSrc16[i1 * 2U] >> 2U;
    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;

    /* writing the butterfly processed i0 + fftLen/4 sample */
    /* writing output (xc', yc') */
    pSrc16[i1 * 2U] = out1;
    pSrc16[(i1 * 2U) + 1U] = out2;

    /*  Butterfly calculations */
    /* input is down scale by 4 to avoid overflow */
    /* U0 = xd, U1 = yd */
    U0 = pSrc16[i3 * 2U] >> 2U;
    U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;

    /* T0 = xb-xd, T1 = yb-yd */
    T0 = __SSAT(T0 - U0, 16U);
    T1 = __SSAT(T1 - U1, 16U);
    /* R0 = (xa-xc) + (yb-yd) , R1 = (ya-yc) - (xb-xd) */
    R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
    R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
    /* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb-xd) */
    S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
    S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);

    /* co1 & si1 are read from Coefficient pointer at complex index ic */
    Co1 = pCoef16[ic * 2U];
    Si1 = pCoef16[(ic * 2U) + 1U];
    /*  Butterfly process for the i0+fftLen/2 sample */
    /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
    out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
    /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
    out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
    /* writing output (xb', yb') */
    pSrc16[i2 * 2U] = out1;
    pSrc16[(i2 * 2U) + 1U] = out2;

    /* Co3 & si3 are read from Coefficient pointer at complex index 3 * ic */
    Co3 = pCoef16[3U * ic * 2U];
    Si3 = pCoef16[(3U * ic * 2U) + 1U];
    /*  Butterfly process for the i0+3fftLen/4 sample */
    /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
    out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
    /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
    out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
    /* writing output (xd', yd') */
    pSrc16[i3 * 2U] = out1;
    pSrc16[(i3 * 2U) + 1U] = out2;

    /*  Twiddle coefficients index modifier */
    ic = ic + twidCoefModifier;

    /*  Updating input index */
    i0 = i0 + 1U;

  } while (--j);

  /*  End of first stage process */

  /* data is in 4.11(q11) format */


  /*  Start of Middle stage process */

  /*  Each later stage steps through the twiddle table four times faster */
  twidCoefModifier <<= 2U;

  /*  Calculation of Middle stage */
  /*  k only counts stages; the body is skipped when fftLen / 4 <= 4 */
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
  {
    /*  Initializations for the middle stage */
    /*  n1 = butterfly group span, n2 = distance between the four inputs */
    n1 = n2;
    n2 >>= 2U;
    ic = 0U;

    for (j = 0U; j <= (n2 - 1U); j++)
    {
      /*  The three twiddles are shared by every group at this offset j */
      Co1 = pCoef16[ic * 2U];
      Si1 = pCoef16[(ic * 2U) + 1U];
      Co2 = pCoef16[2U * ic * 2U];
      Si2 = pCoef16[2U * ic * 2U + 1U];
      Co3 = pCoef16[3U * ic * 2U];
      Si3 = pCoef16[(3U * ic * 2U) + 1U];

      /*  Twiddle coefficients index modifier */
      ic = ic + twidCoefModifier;

      /*  Butterfly implementation: one butterfly per group of n1 samples */
      for (i0 = j; i0 < fftLen; i0 += n1)
      {
        /*  index calculation for the input as, */
        /*  pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
        i1 = i0 + n2;
        i2 = i1 + n2;
        i3 = i2 + n2;

        /*  Reading i0, i0 + 2*n2 inputs */
        /* Read xa (real), ya (imag) input */
        T0 = pSrc16[i0 * 2U];
        T1 = pSrc16[(i0 * 2U) + 1U];

        /* Read xc (real), yc (imag) input */
        S0 = pSrc16[i2 * 2U];
        S1 = pSrc16[(i2 * 2U) + 1U];


        /* R0 = (xa + xc), R1 = (ya + yc) */
        R0 = __SSAT(T0 + S0, 16U);
        R1 = __SSAT(T1 + S1, 16U);
        /* S0 = (xa - xc), S1 = (ya - yc) */
        S0 = __SSAT(T0 - S0, 16U);
        S1 = __SSAT(T1 - S1, 16U);

        /*  Reading i0 + n2 , i0 + 3*n2 inputs */
        /* Read xb (real), yb (imag) input */
        T0 = pSrc16[i1 * 2U];
        T1 = pSrc16[(i1 * 2U) + 1U];

        /* Read xd (real), yd (imag) input */
        U0 = pSrc16[i3 * 2U];
        U1 = pSrc16[(i3 * 2U) + 1U];

        /* T0 = (xb + xd), T1 = (yb + yd) */
        T0 = __SSAT(T0 + U0, 16U);
        T1 = __SSAT(T1 + U1, 16U);

        /*  writing the butterfly processed i0 sample */
        /* xa' = xa + xb + xc + xd */
        /* ya' = ya + yb + yc + yd */
        /* (operands halved before the add, then the sum halved again) */
        pSrc16[i0 * 2U] = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
        pSrc16[(i0 * 2U) + 1U] = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;

        /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), operands halved */
        R0 = (R0 >> 1U) - (T0 >> 1U);
        R1 = (R1 >> 1U) - (T1 >> 1U);

        /* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
        out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
        /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
        out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);

        /*  Re-reading i0 + n2 before that slot is overwritten */
        /* Read xb (real), yb (imag) input */
        T0 = pSrc16[i1 * 2U];
        T1 = pSrc16[(i1 * 2U) + 1U];

        /*  writing the butterfly processed i0 + n2 sample */
        /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
        /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
        pSrc16[i1 * 2U] = out1;
        pSrc16[(i1 * 2U) + 1U] = out2;

        /*  Butterfly calculations */
        /* Read xd (real), yd (imag) input */
        U0 = pSrc16[i3 * 2U];
        U1 = pSrc16[(i3 * 2U) + 1U];

        /* T0 = xb-xd, T1 = yb-yd */
        T0 = __SSAT(T0 - U0, 16U);
        T1 = __SSAT(T1 - U1, 16U);

        /* R0 = (xa-xc) + (yb-yd) , R1 = (ya-yc) - (xb-xd), operands halved */
        R0 = (S0 >> 1U) + (T1 >> 1U);
        R1 = (S1 >> 1U) - (T0 >> 1U);

        /* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb-xd), operands halved */
        S0 = (S0 >> 1U) - (T1 >> 1U);
        S1 = (S1 >> 1U) + (T0 >> 1U);

        /*  Butterfly process for the i0 + 2*n2 sample */
        out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
        out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
        /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
        /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
        pSrc16[i2 * 2U] = out1;
        pSrc16[(i2 * 2U) + 1U] = out2;

        /*  Butterfly process for the i0 + 3*n2 sample */
        out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);

        out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
        /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
        /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
        pSrc16[i3 * 2U] = out1;
        pSrc16[(i3 * 2U) + 1U] = out2;


      }
    }
    /*  Twiddle coefficients index modifier */
    twidCoefModifier <<= 2U;
  }
  /*  End of Middle stages process */


  /* data is in 10.6(q6) format for the 1024 point */
  /* data is in 8.8(q8) format for the 256 point   */
  /* data is in 6.10(q10) format for the 64 point  */
  /* data is in 4.12(q12) format for the 16 point  */

  /* start of last stage process */


  /*  Initializations for the last stage */
  n1 = n2;
  n2 >>= 2U;

  /*  Butterfly implementation (no twiddle multiplications in this stage) */
  for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
  {
    /*  index calculation for the input as, */
    /*  pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
    i1 = i0 + n2;
    i2 = i1 + n2;
    i3 = i2 + n2;

    /*  Reading i0, i0 + 2*n2 inputs */
    /* Read xa (real), ya (imag) input */
    T0 = pSrc16[i0 * 2U];
    T1 = pSrc16[(i0 * 2U) + 1U];
    /* Read xc (real), yc (imag) input */
    S0 = pSrc16[i2 * 2U];
    S1 = pSrc16[(i2 * 2U) + 1U];

    /* R0 = (xa + xc), R1 = (ya + yc) */
    R0 = __SSAT(T0 + S0, 16U);
    R1 = __SSAT(T1 + S1, 16U);
    /* S0 = (xa - xc), S1 = (ya - yc) */
    S0 = __SSAT(T0 - S0, 16U);
    S1 = __SSAT(T1 - S1, 16U);

    /*  Reading i0 + n2 , i0 + 3*n2 inputs */
    /* Read xb (real), yb (imag) input */
    T0 = pSrc16[i1 * 2U];
    T1 = pSrc16[(i1 * 2U) + 1U];
    /* Read xd (real), yd (imag) input */
    U0 = pSrc16[i3 * 2U];
    U1 = pSrc16[(i3 * 2U) + 1U];

    /* T0 = (xb + xd), T1 = (yb + yd) */
    T0 = __SSAT(T0 + U0, 16U);
    T1 = __SSAT(T1 + U1, 16U);

    /*  writing the butterfly processed i0 sample */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    /* (each operand is halved before the add) */
    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);

    /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), operands halved */
    R0 = (R0 >> 1U) - (T0 >> 1U);
    R1 = (R1 >> 1U) - (T1 >> 1U);

    /* Re-read xb (real), yb (imag) before that slot is overwritten */
    T0 = pSrc16[i1 * 2U];
    T1 = pSrc16[(i1 * 2U) + 1U];

    /*  writing the butterfly processed i0 + n2 sample */
    /* xc' = (xa-xb+xc-xd) */
    /* yc' = (ya-yb+yc-yd) */
    pSrc16[i1 * 2U] = R0;
    pSrc16[(i1 * 2U) + 1U] = R1;

    /* Read xd (real), yd (imag) input */
    U0 = pSrc16[i3 * 2U];
    U1 = pSrc16[(i3 * 2U) + 1U];
    /* T0 = (xb - xd), T1 = (yb - yd) */
    T0 = __SSAT(T0 - U0, 16U);
    T1 = __SSAT(T1 - U1, 16U);

    /*  writing the butterfly processed i0 + 2*n2 sample */
    /* xb' = (xa-yb-xc+yd) */
    /* yb' = (ya+xb-yc-xd) */
    pSrc16[i2 * 2U] = (S0 >> 1U) - (T1 >> 1U);
    pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);


    /*  writing the butterfly processed i0 + 3*n2 sample */
    /* xd' = (xa+yb-xc-yd) */
    /* yd' = (ya-xb-yc+xd) */
    pSrc16[i3 * 2U] = (S0 >> 1U) + (T1 >> 1U);
    pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
  }
  /* end of last stage  process */

  /* output is in 11.5(q5) format for the 1024 point */
  /* output is in 9.7(q7) format for the 256 point   */
  /* output is in 7.9(q9) format for the 64 point  */
  /* output is in 5.11(q11) format for the 16 point  */

#endif /* #if defined (ARM_MATH_DSP) */

}
1810