1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cfft_radix4_f16.c
4  * Description:  Radix-4 Decimation in Frequency CFFT & CIFFT Floating point processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/transform_functions_f16.h"
30 
31 #if defined(ARM_FLOAT16_SUPPORTED)
32 
33 extern void arm_bitreversal_f16(
34         float16_t * pSrc,
35         uint16_t fftSize,
36         uint16_t bitRevFactor,
37   const uint16_t * pBitRevTab);
38 
39 ARM_DSP_ATTRIBUTE void arm_radix4_butterfly_f16(
40         float16_t * pSrc,
41         uint16_t fftLen,
42   const float16_t * pCoef,
43         uint16_t twidCoefModifier);
44 
45 ARM_DSP_ATTRIBUTE void arm_radix4_butterfly_inverse_f16(
46         float16_t * pSrc,
47         uint16_t fftLen,
48   const float16_t * pCoef,
49         uint16_t twidCoefModifier,
50         float16_t onebyfftLen);
51 
52 
53 ARM_DSP_ATTRIBUTE void arm_cfft_radix4by2_f16(
54     float16_t * pSrc,
55     uint32_t fftLen,
56     const float16_t * pCoef);
57 
58 
59 /**
60   @addtogroup ComplexFFTDeprecated
61   @{
62  */
63 
64 /*
65 * @brief  Core function for the floating-point CFFT butterfly process.
66 * @param[in, out] *pSrc            points to the in-place buffer of floating-point data type.
67 * @param[in]      fftLen           length of the FFT.
68 * @param[in]      *pCoef           points to the twiddle coefficient buffer.
69 * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
70 */
71 
arm_cfft_radix4by2_f16(float16_t * pSrc,uint32_t fftLen,const float16_t * pCoef)72 ARM_DSP_ATTRIBUTE void arm_cfft_radix4by2_f16(
73     float16_t * pSrc,
74     uint32_t fftLen,
75     const float16_t * pCoef)
76 {
77     uint32_t i, l;
78     uint32_t n2, ia;
79     float16_t xt, yt, cosVal, sinVal;
80     float16_t p0, p1,p2,p3,a0,a1;
81 
82     n2 = fftLen >> 1;
83     ia = 0;
84     for (i = 0; i < n2; i++)
85     {
86         cosVal = pCoef[2*ia];
87         sinVal = pCoef[2*ia + 1];
88         ia++;
89 
90         l = i + n2;
91 
92         /*  Butterfly implementation */
93         a0 = (_Float16)pSrc[2 * i] + (_Float16)pSrc[2 * l];
94         xt = (_Float16)pSrc[2 * i] - (_Float16)pSrc[2 * l];
95 
96         yt = (_Float16)pSrc[2 * i + 1] - (_Float16)pSrc[2 * l + 1];
97         a1 = (_Float16)pSrc[2 * l + 1] + (_Float16)pSrc[2 * i + 1];
98 
99         p0 = (_Float16)xt * (_Float16)cosVal;
100         p1 = (_Float16)yt * (_Float16)sinVal;
101         p2 = (_Float16)yt * (_Float16)cosVal;
102         p3 = (_Float16)xt * (_Float16)sinVal;
103 
104         pSrc[2 * i]     = a0;
105         pSrc[2 * i + 1] = a1;
106 
107         pSrc[2 * l]     = (_Float16)p0 + (_Float16)p1;
108         pSrc[2 * l + 1] = (_Float16)p2 - (_Float16)p3;
109 
110     }
111 
112     // first col
113     arm_radix4_butterfly_f16( pSrc, n2, (float16_t*)pCoef, 2U);
114     // second col
115     arm_radix4_butterfly_f16( pSrc + fftLen, n2, (float16_t*)pCoef, 2U);
116 
117 }
118 
119 
120 /**
121   @brief         Processing function for the floating-point Radix-4 CFFT/CIFFT.
122   @deprecated    Do not use this function. It has been superseded by \ref arm_cfft_f16 and will be removed in the future.
123   @param[in]     S    points to an instance of the floating-point Radix-4 CFFT/CIFFT structure
124   @param[in,out] pSrc points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
125  */
126 
arm_cfft_radix4_f16(const arm_cfft_radix4_instance_f16 * S,float16_t * pSrc)127 ARM_DSP_ATTRIBUTE void arm_cfft_radix4_f16(
128   const arm_cfft_radix4_instance_f16 * S,
129         float16_t * pSrc)
130 {
131    if (S->ifftFlag == 1U)
132    {
133       /*  Complex IFFT radix-4  */
134       arm_radix4_butterfly_inverse_f16(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier, S->onebyfftLen);
135    }
136    else
137    {
138       /*  Complex FFT radix-4  */
139       arm_radix4_butterfly_f16(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
140    }
141 
142    if (S->bitReverseFlag == 1U)
143    {
144       /*  Bit Reversal */
145       arm_bitreversal_f16(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
146    }
147 
148 }
149 
150 /**
151   @} end of ComplexFFTDeprecated group
152  */
153 
154 /* ----------------------------------------------------------------------
155  * Internal helper function used by the FFTs
156  * ---------------------------------------------------------------------- */
157 
158 /*
159 * @brief  Core function for the floating-point CFFT butterfly process.
160 * @param[in, out] *pSrc            points to the in-place buffer of floating-point data type.
161 * @param[in]      fftLen           length of the FFT.
162 * @param[in]      *pCoef           points to the twiddle coefficient buffer.
163 * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
164 */
165 
arm_radix4_butterfly_f16(float16_t * pSrc,uint16_t fftLen,const float16_t * pCoef,uint16_t twidCoefModifier)166 ARM_DSP_ATTRIBUTE void arm_radix4_butterfly_f16(
167 float16_t * pSrc,
168 uint16_t fftLen,
169 const float16_t * pCoef,
170 uint16_t twidCoefModifier)
171 {
172 
173    float16_t co1, co2, co3, si1, si2, si3;
174    uint32_t ia1, ia2, ia3;
175    uint32_t i0, i1, i2, i3;
176    uint32_t n1, n2, j, k;
177 
178 #if defined (ARM_MATH_DSP)
179 
180    /* Run the below code for Cortex-M4 and Cortex-M3 */
181 
182    float16_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn;
183    float16_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc,
184    Ybminusd;
185    float16_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out;
186    float16_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out;
187    float16_t *ptr1;
188    float16_t p0,p1,p2,p3,p4,p5;
189    float16_t a0,a1,a2,a3,a4,a5,a6,a7;
190 
191    /*  Initializations for the first stage */
192    n2 = fftLen;
193    n1 = n2;
194 
195    /* n2 = fftLen/4 */
196    n2 >>= 2U;
197    i0 = 0U;
198    ia1 = 0U;
199 
200    j = n2;
201 
202    /*  Calculation of first stage */
203    do
204    {
205       /*  index calculation for the input as, */
206       /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
207       i1 = i0 + n2;
208       i2 = i1 + n2;
209       i3 = i2 + n2;
210 
211       xaIn = pSrc[(2U * i0)];
212       yaIn = pSrc[(2U * i0) + 1U];
213 
214       xbIn = pSrc[(2U * i1)];
215       ybIn = pSrc[(2U * i1) + 1U];
216 
217       xcIn = pSrc[(2U * i2)];
218       ycIn = pSrc[(2U * i2) + 1U];
219 
220       xdIn = pSrc[(2U * i3)];
221       ydIn = pSrc[(2U * i3) + 1U];
222 
223       /* xa + xc */
224       Xaplusc = (_Float16)xaIn + (_Float16)xcIn;
225       /* xb + xd */
226       Xbplusd = (_Float16)xbIn + (_Float16)xdIn;
227       /* ya + yc */
228       Yaplusc = (_Float16)yaIn + (_Float16)ycIn;
229       /* yb + yd */
230       Ybplusd = (_Float16)ybIn + (_Float16)ydIn;
231 
232       /*  index calculation for the coefficients */
233       ia2 = ia1 + ia1;
234       co2 = pCoef[ia2 * 2U];
235       si2 = pCoef[(ia2 * 2U) + 1U];
236 
237       /* xa - xc */
238       Xaminusc = (_Float16)xaIn - (_Float16)xcIn;
239       /* xb - xd */
240       Xbminusd = (_Float16)xbIn - (_Float16)xdIn;
241       /* ya - yc */
242       Yaminusc = (_Float16)yaIn - (_Float16)ycIn;
243       /* yb - yd */
244       Ybminusd = (_Float16)ybIn - (_Float16)ydIn;
245 
246       /* xa' = xa + xb + xc + xd */
247       pSrc[(2U * i0)] = (_Float16)Xaplusc + (_Float16)Xbplusd;
248       /* ya' = ya + yb + yc + yd */
249       pSrc[(2U * i0) + 1U] = (_Float16)Yaplusc + (_Float16)Ybplusd;
250 
251       /* (xa - xc) + (yb - yd) */
252       Xb12C_out = ((_Float16)Xaminusc + (_Float16)Ybminusd);
253       /* (ya - yc) + (xb - xd) */
254       Yb12C_out = ((_Float16)Yaminusc - (_Float16)Xbminusd);
255       /* (xa + xc) - (xb + xd) */
256       Xc12C_out = ((_Float16)Xaplusc - (_Float16)Xbplusd);
257       /* (ya + yc) - (yb + yd) */
258       Yc12C_out = ((_Float16)Yaplusc - (_Float16)Ybplusd);
259       /* (xa - xc) - (yb - yd) */
260       Xd12C_out = ((_Float16)Xaminusc - (_Float16)Ybminusd);
261       /* (ya - yc) + (xb - xd) */
262       Yd12C_out = ((_Float16)Xbminusd + (_Float16)Yaminusc);
263 
264       co1 = pCoef[ia1 * 2U];
265       si1 = pCoef[(ia1 * 2U) + 1U];
266 
267       /*  index calculation for the coefficients */
268       ia3 = ia2 + ia1;
269       co3 = pCoef[ia3 * 2U];
270       si3 = pCoef[(ia3 * 2U) + 1U];
271 
272       Xb12_out = (_Float16)Xb12C_out * (_Float16)co1;
273       Yb12_out = (_Float16)Yb12C_out * (_Float16)co1;
274       Xc12_out = (_Float16)Xc12C_out * (_Float16)co2;
275       Yc12_out = (_Float16)Yc12C_out * (_Float16)co2;
276       Xd12_out = (_Float16)Xd12C_out * (_Float16)co3;
277       Yd12_out = (_Float16)Yd12C_out * (_Float16)co3;
278 
279       /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
280       //Xb12_out -= Yb12C_out * si1;
281       p0 = (_Float16)Yb12C_out * (_Float16)si1;
282       /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
283       //Yb12_out += Xb12C_out * si1;
284       p1 = (_Float16)Xb12C_out * (_Float16)si1;
285       /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
286       //Xc12_out -= Yc12C_out * si2;
287       p2 = (_Float16)Yc12C_out * (_Float16)si2;
288       /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
289       //Yc12_out += Xc12C_out * si2;
290       p3 = (_Float16)Xc12C_out * (_Float16)si2;
291       /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
292       //Xd12_out -= Yd12C_out * si3;
293       p4 = (_Float16)Yd12C_out * (_Float16)si3;
294       /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
295       //Yd12_out += Xd12C_out * si3;
296       p5 = (_Float16)Xd12C_out * (_Float16)si3;
297 
298       Xb12_out += (_Float16)p0;
299       Yb12_out -= (_Float16)p1;
300       Xc12_out += (_Float16)p2;
301       Yc12_out -= (_Float16)p3;
302       Xd12_out += (_Float16)p4;
303       Yd12_out -= (_Float16)p5;
304 
305       /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
306       pSrc[2U * i1] = Xc12_out;
307 
308       /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
309       pSrc[(2U * i1) + 1U] = Yc12_out;
310 
311       /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
312       pSrc[2U * i2] = Xb12_out;
313 
314       /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
315       pSrc[(2U * i2) + 1U] = Yb12_out;
316 
317       /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
318       pSrc[2U * i3] = Xd12_out;
319 
320       /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
321       pSrc[(2U * i3) + 1U] = Yd12_out;
322 
323       /*  Twiddle coefficients index modifier */
324       ia1 += twidCoefModifier;
325 
326       /*  Updating input index */
327       i0++;
328 
329    }
330    while (--j);
331 
332    twidCoefModifier <<= 2U;
333 
334    /*  Calculation of second stage to excluding last stage */
335    for (k = fftLen >> 2U; k > 4U; k >>= 2U)
336    {
337       /*  Initializations for the first stage */
338       n1 = n2;
339       n2 >>= 2U;
340       ia1 = 0U;
341 
342       /*  Calculation of first stage */
343       j = 0;
344       do
345       {
346          /*  index calculation for the coefficients */
347          ia2 = ia1 + ia1;
348          ia3 = ia2 + ia1;
349          co1 = pCoef[ia1 * 2U];
350          si1 = pCoef[(ia1 * 2U) + 1U];
351          co2 = pCoef[ia2 * 2U];
352          si2 = pCoef[(ia2 * 2U) + 1U];
353          co3 = pCoef[ia3 * 2U];
354          si3 = pCoef[(ia3 * 2U) + 1U];
355 
356          /*  Twiddle coefficients index modifier */
357          ia1 += twidCoefModifier;
358 
359          i0 = j;
360          do
361          {
362             /*  index calculation for the input as, */
363             /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
364             i1 = i0 + n2;
365             i2 = i1 + n2;
366             i3 = i2 + n2;
367 
368             xaIn = pSrc[(2U * i0)];
369             yaIn = pSrc[(2U * i0) + 1U];
370 
371             xbIn = pSrc[(2U * i1)];
372             ybIn = pSrc[(2U * i1) + 1U];
373 
374             xcIn = pSrc[(2U * i2)];
375             ycIn = pSrc[(2U * i2) + 1U];
376 
377             xdIn = pSrc[(2U * i3)];
378             ydIn = pSrc[(2U * i3) + 1U];
379 
380             /* xa - xc */
381             Xaminusc = (_Float16)xaIn - (_Float16)xcIn;
382             /* (xb - xd) */
383             Xbminusd = (_Float16)xbIn - (_Float16)xdIn;
384             /* ya - yc */
385             Yaminusc = (_Float16)yaIn - (_Float16)ycIn;
386             /* (yb - yd) */
387             Ybminusd = (_Float16)ybIn - (_Float16)ydIn;
388 
389             /* xa + xc */
390             Xaplusc = (_Float16)xaIn + (_Float16)xcIn;
391             /* xb + xd */
392             Xbplusd = (_Float16)xbIn + (_Float16)xdIn;
393             /* ya + yc */
394             Yaplusc = (_Float16)yaIn + (_Float16)ycIn;
395             /* yb + yd */
396             Ybplusd = (_Float16)ybIn + (_Float16)ydIn;
397 
398             /* (xa - xc) + (yb - yd) */
399             Xb12C_out = ((_Float16)Xaminusc + (_Float16)Ybminusd);
400             /* (ya - yc) -  (xb - xd) */
401             Yb12C_out = ((_Float16)Yaminusc - (_Float16)Xbminusd);
402             /* xa + xc -(xb + xd) */
403             Xc12C_out = ((_Float16)Xaplusc - (_Float16)Xbplusd);
404             /* (ya + yc) - (yb + yd) */
405             Yc12C_out = ((_Float16)Yaplusc - (_Float16)Ybplusd);
406             /* (xa - xc) - (yb - yd) */
407             Xd12C_out = ((_Float16)Xaminusc - (_Float16)Ybminusd);
408             /* (ya - yc) +  (xb - xd) */
409             Yd12C_out = ((_Float16)Xbminusd + (_Float16)Yaminusc);
410 
411             pSrc[(2U * i0)] = (_Float16)Xaplusc + (_Float16)Xbplusd;
412             pSrc[(2U * i0) + 1U] = (_Float16)Yaplusc + (_Float16)Ybplusd;
413 
414             Xb12_out = (_Float16)Xb12C_out * (_Float16)co1;
415             Yb12_out = (_Float16)Yb12C_out * (_Float16)co1;
416             Xc12_out = (_Float16)Xc12C_out * (_Float16)co2;
417             Yc12_out = (_Float16)Yc12C_out * (_Float16)co2;
418             Xd12_out = (_Float16)Xd12C_out * (_Float16)co3;
419             Yd12_out = (_Float16)Yd12C_out * (_Float16)co3;
420 
421             /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
422             //Xb12_out -= Yb12C_out * si1;
423             p0 = (_Float16)Yb12C_out * (_Float16)si1;
424             /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
425             //Yb12_out += Xb12C_out * si1;
426             p1 = (_Float16)Xb12C_out * (_Float16)si1;
427             /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
428             //Xc12_out -= Yc12C_out * si2;
429             p2 = (_Float16)Yc12C_out * (_Float16)si2;
430             /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
431             //Yc12_out += Xc12C_out * si2;
432             p3 = (_Float16)Xc12C_out * (_Float16)si2;
433             /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
434             //Xd12_out -= Yd12C_out * si3;
435             p4 = (_Float16)Yd12C_out * (_Float16)si3;
436             /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
437             //Yd12_out += Xd12C_out * si3;
438             p5 = (_Float16)Xd12C_out * (_Float16)si3;
439 
440             Xb12_out += (_Float16)p0;
441             Yb12_out -= (_Float16)p1;
442             Xc12_out += (_Float16)p2;
443             Yc12_out -= (_Float16)p3;
444             Xd12_out += (_Float16)p4;
445             Yd12_out -= (_Float16)p5;
446 
447             /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
448             pSrc[2U * i1] = Xc12_out;
449 
450             /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
451             pSrc[(2U * i1) + 1U] = Yc12_out;
452 
453             /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
454             pSrc[2U * i2] = Xb12_out;
455 
456             /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
457             pSrc[(2U * i2) + 1U] = Yb12_out;
458 
459             /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
460             pSrc[2U * i3] = Xd12_out;
461 
462             /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
463             pSrc[(2U * i3) + 1U] = Yd12_out;
464 
465             i0 += n1;
466          } while (i0 < fftLen);
467          j++;
468       } while (j <= (n2 - 1U));
469       twidCoefModifier <<= 2U;
470    }
471 
472    j = fftLen >> 2;
473    ptr1 = &pSrc[0];
474 
475    /*  Calculations of last stage */
476    do
477    {
478       xaIn = ptr1[0];
479       yaIn = ptr1[1];
480       xbIn = ptr1[2];
481       ybIn = ptr1[3];
482       xcIn = ptr1[4];
483       ycIn = ptr1[5];
484       xdIn = ptr1[6];
485       ydIn = ptr1[7];
486 
487       /* xa + xc */
488       Xaplusc = (_Float16)xaIn + (_Float16)xcIn;
489 
490       /* xa - xc */
491       Xaminusc = (_Float16)xaIn - (_Float16)xcIn;
492 
493       /* ya + yc */
494       Yaplusc = (_Float16)yaIn + (_Float16)ycIn;
495 
496       /* ya - yc */
497       Yaminusc = (_Float16)yaIn - (_Float16)ycIn;
498 
499       /* xb + xd */
500       Xbplusd = (_Float16)xbIn + (_Float16)xdIn;
501 
502       /* yb + yd */
503       Ybplusd = (_Float16)ybIn + (_Float16)ydIn;
504 
505       /* (xb-xd) */
506       Xbminusd = (_Float16)xbIn - (_Float16)xdIn;
507 
508       /* (yb-yd) */
509       Ybminusd = (_Float16)ybIn - (_Float16)ydIn;
510 
511       /* xa' = xa + xb + xc + xd */
512       a0 = ((_Float16)Xaplusc + (_Float16)Xbplusd);
513       /* ya' = ya + yb + yc + yd */
514       a1 = ((_Float16)Yaplusc + (_Float16)Ybplusd);
515       /* xc' = (xa-xb+xc-xd) */
516       a2 = ((_Float16)Xaplusc - (_Float16)Xbplusd);
517       /* yc' = (ya-yb+yc-yd) */
518       a3 = ((_Float16)Yaplusc - (_Float16)Ybplusd);
519       /* xb' = (xa+yb-xc-yd) */
520       a4 = ((_Float16)Xaminusc + (_Float16)Ybminusd);
521       /* yb' = (ya-xb-yc+xd) */
522       a5 = ((_Float16)Yaminusc - (_Float16)Xbminusd);
523       /* xd' = (xa-yb-xc+yd)) */
524       a6 = ((_Float16)Xaminusc - (_Float16)Ybminusd);
525       /* yd' = (ya+xb-yc-xd) */
526       a7 = ((_Float16)Xbminusd + (_Float16)Yaminusc);
527 
528       ptr1[0] = a0;
529       ptr1[1] = a1;
530       ptr1[2] = a2;
531       ptr1[3] = a3;
532       ptr1[4] = a4;
533       ptr1[5] = a5;
534       ptr1[6] = a6;
535       ptr1[7] = a7;
536 
537       /* increment pointer by 8 */
538       ptr1 += 8U;
539    } while (--j);
540 
541 #else
542 
543    float16_t t1, t2, r1, r2, s1, s2;
544 
545    /* Run the below code for Cortex-M0 */
546 
547    /*  Initializations for the fft calculation */
548    n2 = fftLen;
549    n1 = n2;
550    for (k = fftLen; k > 1U; k >>= 2U)
551    {
552       /*  Initializations for the fft calculation */
553       n1 = n2;
554       n2 >>= 2U;
555       ia1 = 0U;
556 
557       /*  FFT Calculation */
558       j = 0;
559       do
560       {
561          /*  index calculation for the coefficients */
562          ia2 = ia1 + ia1;
563          ia3 = ia2 + ia1;
564          co1 = pCoef[ia1 * 2U];
565          si1 = pCoef[(ia1 * 2U) + 1U];
566          co2 = pCoef[ia2 * 2U];
567          si2 = pCoef[(ia2 * 2U) + 1U];
568          co3 = pCoef[ia3 * 2U];
569          si3 = pCoef[(ia3 * 2U) + 1U];
570 
571          /*  Twiddle coefficients index modifier */
572          ia1 = ia1 + twidCoefModifier;
573 
574          i0 = j;
575          do
576          {
577             /*  index calculation for the input as, */
578             /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
579             i1 = i0 + n2;
580             i2 = i1 + n2;
581             i3 = i2 + n2;
582 
583             /* xa + xc */
584             r1 = (_Float16)pSrc[(2U * i0)] + (_Float16)pSrc[(2U * i2)];
585 
586             /* xa - xc */
587             r2 = (_Float16)pSrc[(2U * i0)] - (_Float16)pSrc[(2U * i2)];
588 
589             /* ya + yc */
590             s1 = (_Float16)pSrc[(2U * i0) + 1U] + (_Float16)pSrc[(2U * i2) + 1U];
591 
592             /* ya - yc */
593             s2 = (_Float16)pSrc[(2U * i0) + 1U] - (_Float16)pSrc[(2U * i2) + 1U];
594 
595             /* xb + xd */
596             t1 = (_Float16)pSrc[2U * i1] + (_Float16)pSrc[2U * i3];
597 
598             /* xa' = xa + xb + xc + xd */
599             pSrc[2U * i0] = (_Float16)r1 + (_Float16)t1;
600 
601             /* xa + xc -(xb + xd) */
602             r1 = (_Float16)r1 - (_Float16)t1;
603 
604             /* yb + yd */
605             t2 = (_Float16)pSrc[(2U * i1) + 1U] + (_Float16)pSrc[(2U * i3) + 1U];
606 
607             /* ya' = ya + yb + yc + yd */
608             pSrc[(2U * i0) + 1U] = (_Float16)s1 + (_Float16)t2;
609 
610             /* (ya + yc) - (yb + yd) */
611             s1 = (_Float16)s1 - (_Float16)t2;
612 
613             /* (yb - yd) */
614             t1 = (_Float16)pSrc[(2U * i1) + 1U] - (_Float16)pSrc[(2U * i3) + 1U];
615 
616             /* (xb - xd) */
617             t2 = (_Float16)pSrc[2U * i1] - (_Float16)pSrc[2U * i3];
618 
619             /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
620             pSrc[2U * i1] = ((_Float16)r1 * (_Float16)co2) + ((_Float16)s1 * (_Float16)si2);
621 
622             /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
623             pSrc[(2U * i1) + 1U] = ((_Float16)s1 * (_Float16)co2) - ((_Float16)r1 * (_Float16)si2);
624 
625             /* (xa - xc) + (yb - yd) */
626             r1 = (_Float16)r2 + (_Float16)t1;
627 
628             /* (xa - xc) - (yb - yd) */
629             r2 = (_Float16)r2 - (_Float16)t1;
630 
631             /* (ya - yc) -  (xb - xd) */
632             s1 = (_Float16)s2 - (_Float16)t2;
633 
634             /* (ya - yc) +  (xb - xd) */
635             s2 = (_Float16)s2 + (_Float16)t2;
636 
637             /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
638             pSrc[2U * i2] = ((_Float16)r1 * (_Float16)co1) + ((_Float16)s1 * (_Float16)si1);
639 
640             /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
641             pSrc[(2U * i2) + 1U] = ((_Float16)s1 * (_Float16)co1) - ((_Float16)r1 * (_Float16)si1);
642 
643             /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
644             pSrc[2U * i3] = ((_Float16)r2 * (_Float16)co3) + ((_Float16)s2 * (_Float16)si3);
645 
646             /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
647             pSrc[(2U * i3) + 1U] = ((_Float16)s2 * (_Float16)co3) - ((_Float16)r2 * (_Float16)si3);
648 
649             i0 += n1;
650          } while ( i0 < fftLen);
651          j++;
652       } while (j <= (n2 - 1U));
653       twidCoefModifier <<= 2U;
654    }
655 
656 #endif /* #if defined (ARM_MATH_DSP) */
657 
658 }
659 
660 /*
661 * @brief  Core function for the floating-point CIFFT butterfly process.
662 * @param[in, out] *pSrc            points to the in-place buffer of floating-point data type.
663 * @param[in]      fftLen           length of the FFT.
664 * @param[in]      *pCoef           points to twiddle coefficient buffer.
665 * @param[in]      twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
666 * @param[in]      onebyfftLen      value of 1/fftLen.
667 */
668 
arm_radix4_butterfly_inverse_f16(float16_t * pSrc,uint16_t fftLen,const float16_t * pCoef,uint16_t twidCoefModifier,float16_t onebyfftLen)669 ARM_DSP_ATTRIBUTE void arm_radix4_butterfly_inverse_f16(
670 float16_t * pSrc,
671 uint16_t fftLen,
672 const float16_t * pCoef,
673 uint16_t twidCoefModifier,
674 float16_t onebyfftLen)
675 {
676    float16_t co1, co2, co3, si1, si2, si3;
677    uint32_t ia1, ia2, ia3;
678    uint32_t i0, i1, i2, i3;
679    uint32_t n1, n2, j, k;
680 
681 #if defined (ARM_MATH_DSP)
682 
683    float16_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn;
684    float16_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc,
685    Ybminusd;
686    float16_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out;
687    float16_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out;
688    float16_t *ptr1;
689    float16_t p0,p1,p2,p3,p4,p5,p6,p7;
690    float16_t a0,a1,a2,a3,a4,a5,a6,a7;
691 
692 
693    /*  Initializations for the first stage */
694    n2 = fftLen;
695    n1 = n2;
696 
697    /* n2 = fftLen/4 */
698    n2 >>= 2U;
699    i0 = 0U;
700    ia1 = 0U;
701 
702    j = n2;
703 
704    /*  Calculation of first stage */
705    do
706    {
707       /*  index calculation for the input as, */
708       /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
709       i1 = i0 + n2;
710       i2 = i1 + n2;
711       i3 = i2 + n2;
712 
713       /*  Butterfly implementation */
714       xaIn = pSrc[(2U * i0)];
715       yaIn = pSrc[(2U * i0) + 1U];
716 
717       xcIn = pSrc[(2U * i2)];
718       ycIn = pSrc[(2U * i2) + 1U];
719 
720       xbIn = pSrc[(2U * i1)];
721       ybIn = pSrc[(2U * i1) + 1U];
722 
723       xdIn = pSrc[(2U * i3)];
724       ydIn = pSrc[(2U * i3) + 1U];
725 
726       /* xa + xc */
727       Xaplusc = (_Float16)xaIn + (_Float16)xcIn;
728       /* xb + xd */
729       Xbplusd = (_Float16)xbIn + (_Float16)xdIn;
730       /* ya + yc */
731       Yaplusc = (_Float16)yaIn + (_Float16)ycIn;
732       /* yb + yd */
733       Ybplusd = (_Float16)ybIn + (_Float16)ydIn;
734 
735       /*  index calculation for the coefficients */
736       ia2 = ia1 + ia1;
737       co2 = pCoef[ia2 * 2U];
738       si2 = pCoef[(ia2 * 2U) + 1U];
739 
740       /* xa - xc */
741       Xaminusc = (_Float16)xaIn - (_Float16)xcIn;
742       /* xb - xd */
743       Xbminusd = (_Float16)xbIn - (_Float16)xdIn;
744       /* ya - yc */
745       Yaminusc = (_Float16)yaIn - (_Float16)ycIn;
746       /* yb - yd */
747       Ybminusd = (_Float16)ybIn - (_Float16)ydIn;
748 
749       /* xa' = xa + xb + xc + xd */
750       pSrc[(2U * i0)] = (_Float16)Xaplusc + (_Float16)Xbplusd;
751 
752       /* ya' = ya + yb + yc + yd */
753       pSrc[(2U * i0) + 1U] = (_Float16)Yaplusc + (_Float16)Ybplusd;
754 
755       /* (xa - xc) - (yb - yd) */
756       Xb12C_out = ((_Float16)Xaminusc - (_Float16)Ybminusd);
757       /* (ya - yc) + (xb - xd) */
758       Yb12C_out = ((_Float16)Yaminusc + (_Float16)Xbminusd);
759       /* (xa + xc) - (xb + xd) */
760       Xc12C_out = ((_Float16)Xaplusc - (_Float16)Xbplusd);
761       /* (ya + yc) - (yb + yd) */
762       Yc12C_out = ((_Float16)Yaplusc - (_Float16)Ybplusd);
763       /* (xa - xc) + (yb - yd) */
764       Xd12C_out = ((_Float16)Xaminusc + (_Float16)Ybminusd);
765       /* (ya - yc) - (xb - xd) */
766       Yd12C_out = ((_Float16)Yaminusc - (_Float16)Xbminusd);
767 
768       co1 = pCoef[ia1 * 2U];
769       si1 = pCoef[(ia1 * 2U) + 1U];
770 
771       /*  index calculation for the coefficients */
772       ia3 = ia2 + ia1;
773       co3 = pCoef[ia3 * 2U];
774       si3 = pCoef[(ia3 * 2U) + 1U];
775 
776       Xb12_out = (_Float16)Xb12C_out * (_Float16)co1;
777       Yb12_out = (_Float16)Yb12C_out * (_Float16)co1;
778       Xc12_out = (_Float16)Xc12C_out * (_Float16)co2;
779       Yc12_out = (_Float16)Yc12C_out * (_Float16)co2;
780       Xd12_out = (_Float16)Xd12C_out * (_Float16)co3;
781       Yd12_out = (_Float16)Yd12C_out * (_Float16)co3;
782 
783       /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
784       //Xb12_out -= Yb12C_out * si1;
785       p0 = (_Float16)Yb12C_out * (_Float16)si1;
786       /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
787       //Yb12_out += Xb12C_out * si1;
788       p1 = (_Float16)Xb12C_out * (_Float16)si1;
789       /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
790       //Xc12_out -= Yc12C_out * si2;
791       p2 = (_Float16)Yc12C_out * (_Float16)si2;
792       /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
793       //Yc12_out += Xc12C_out * si2;
794       p3 = (_Float16)Xc12C_out * (_Float16)si2;
795       /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
796       //Xd12_out -= Yd12C_out * si3;
797       p4 = (_Float16)Yd12C_out * (_Float16)si3;
798       /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
799       //Yd12_out += Xd12C_out * si3;
800       p5 =(_Float16) Xd12C_out * (_Float16)si3;
801 
802       Xb12_out -= (_Float16)p0;
803       Yb12_out += (_Float16)p1;
804       Xc12_out -= (_Float16)p2;
805       Yc12_out += (_Float16)p3;
806       Xd12_out -= (_Float16)p4;
807       Yd12_out += (_Float16)p5;
808 
809       /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
810       pSrc[2U * i1] = Xc12_out;
811 
812       /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
813       pSrc[(2U * i1) + 1U] = Yc12_out;
814 
815       /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
816       pSrc[2U * i2] = Xb12_out;
817 
818       /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
819       pSrc[(2U * i2) + 1U] = Yb12_out;
820 
821       /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
822       pSrc[2U * i3] = Xd12_out;
823 
824       /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
825       pSrc[(2U * i3) + 1U] = Yd12_out;
826 
827       /*  Twiddle coefficients index modifier */
828       ia1 = ia1 + twidCoefModifier;
829 
830       /*  Updating input index */
831       i0 = i0 + 1U;
832 
833    } while (--j);
834 
835    twidCoefModifier <<= 2U;
836 
837    /*  Calculation of second stage to excluding last stage */
838    for (k = fftLen >> 2U; k > 4U; k >>= 2U)
839    {
840       /*  Initializations for the first stage */
841       n1 = n2;
842       n2 >>= 2U;
843       ia1 = 0U;
844 
845       /*  Calculation of first stage */
846       j = 0;
847       do
848       {
849          /*  index calculation for the coefficients */
850          ia2 = ia1 + ia1;
851          ia3 = ia2 + ia1;
852          co1 = pCoef[ia1 * 2U];
853          si1 = pCoef[(ia1 * 2U) + 1U];
854          co2 = pCoef[ia2 * 2U];
855          si2 = pCoef[(ia2 * 2U) + 1U];
856          co3 = pCoef[ia3 * 2U];
857          si3 = pCoef[(ia3 * 2U) + 1U];
858 
859          /*  Twiddle coefficients index modifier */
860          ia1 = ia1 + twidCoefModifier;
861 
862          i0 = j;
863          do
864          {
865             /*  index calculation for the input as, */
866             /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
867             i1 = i0 + n2;
868             i2 = i1 + n2;
869             i3 = i2 + n2;
870 
871             xaIn = pSrc[(2U * i0)];
872             yaIn = pSrc[(2U * i0) + 1U];
873 
874             xbIn = pSrc[(2U * i1)];
875             ybIn = pSrc[(2U * i1) + 1U];
876 
877             xcIn = pSrc[(2U * i2)];
878             ycIn = pSrc[(2U * i2) + 1U];
879 
880             xdIn = pSrc[(2U * i3)];
881             ydIn = pSrc[(2U * i3) + 1U];
882 
883             /* xa - xc */
884             Xaminusc = (_Float16)xaIn - (_Float16)xcIn;
885             /* (xb - xd) */
886             Xbminusd = (_Float16)xbIn - (_Float16)xdIn;
887             /* ya - yc */
888             Yaminusc = (_Float16)yaIn - (_Float16)ycIn;
889             /* (yb - yd) */
890             Ybminusd = (_Float16)ybIn - (_Float16)ydIn;
891 
892             /* xa + xc */
893             Xaplusc = (_Float16)xaIn + (_Float16)xcIn;
894             /* xb + xd */
895             Xbplusd = (_Float16)xbIn + (_Float16)xdIn;
896             /* ya + yc */
897             Yaplusc = (_Float16)yaIn + (_Float16)ycIn;
898             /* yb + yd */
899             Ybplusd = (_Float16)ybIn + (_Float16)ydIn;
900 
901             /* (xa - xc) - (yb - yd) */
902             Xb12C_out = ((_Float16)Xaminusc - (_Float16)Ybminusd);
903             /* (ya - yc) +  (xb - xd) */
904             Yb12C_out = ((_Float16)Yaminusc + (_Float16)Xbminusd);
905             /* xa + xc -(xb + xd) */
906             Xc12C_out = ((_Float16)Xaplusc - (_Float16)Xbplusd);
907             /* (ya + yc) - (yb + yd) */
908             Yc12C_out = ((_Float16)Yaplusc - (_Float16)Ybplusd);
909             /* (xa - xc) + (yb - yd) */
910             Xd12C_out = ((_Float16)Xaminusc + (_Float16)Ybminusd);
911             /* (ya - yc) -  (xb - xd) */
912             Yd12C_out = ((_Float16)Yaminusc - (_Float16)Xbminusd);
913 
914             pSrc[(2U * i0)] = (_Float16)Xaplusc + (_Float16)Xbplusd;
915             pSrc[(2U * i0) + 1U] = (_Float16)Yaplusc + (_Float16)Ybplusd;
916 
917             Xb12_out = (_Float16)Xb12C_out * (_Float16)co1;
918             Yb12_out = (_Float16)Yb12C_out * (_Float16)co1;
919             Xc12_out = (_Float16)Xc12C_out * (_Float16)co2;
920             Yc12_out = (_Float16)Yc12C_out * (_Float16)co2;
921             Xd12_out = (_Float16)Xd12C_out * (_Float16)co3;
922             Yd12_out = (_Float16)Yd12C_out * (_Float16)co3;
923 
            /* xb' = (xa-yb-xc+yd)co1 - (ya+xb-yc-xd)(si1) */
925             //Xb12_out -= Yb12C_out * si1;
926             p0 = (_Float16)Yb12C_out * (_Float16)si1;
            /* yb' = (ya+xb-yc-xd)co1 + (xa-yb-xc+yd)(si1) */
928             //Yb12_out += Xb12C_out * si1;
929             p1 = (_Float16)Xb12C_out * (_Float16)si1;
930             /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
931             //Xc12_out -= Yc12C_out * si2;
932             p2 = (_Float16)Yc12C_out * (_Float16)si2;
933             /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
934             //Yc12_out += Xc12C_out * si2;
935             p3 = (_Float16)Xc12C_out * (_Float16)si2;
            /* xd' = (xa+yb-xc-yd)co3 - (ya-xb-yc+xd)(si3) */
937             //Xd12_out -= Yd12C_out * si3;
938             p4 = (_Float16)Yd12C_out * (_Float16)si3;
            /* yd' = (ya-xb-yc+xd)co3 + (xa+yb-xc-yd)(si3) */
940             //Yd12_out += Xd12C_out * si3;
941             p5 = (_Float16)Xd12C_out * (_Float16)si3;
942 
943             Xb12_out -= (_Float16)p0;
944             Yb12_out += (_Float16)p1;
945             Xc12_out -= (_Float16)p2;
946             Yc12_out += (_Float16)p3;
947             Xd12_out -= (_Float16)p4;
948             Yd12_out += (_Float16)p5;
949 
950             /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
951             pSrc[2U * i1] = Xc12_out;
952 
953             /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
954             pSrc[(2U * i1) + 1U] = Yc12_out;
955 
            /* xb' = (xa-yb-xc+yd)co1 - (ya+xb-yc-xd)(si1) */
957             pSrc[2U * i2] = Xb12_out;
958 
            /* yb' = (ya+xb-yc-xd)co1 + (xa-yb-xc+yd)(si1) */
960             pSrc[(2U * i2) + 1U] = Yb12_out;
961 
            /* xd' = (xa+yb-xc-yd)co3 - (ya-xb-yc+xd)(si3) */
963             pSrc[2U * i3] = Xd12_out;
964 
            /* yd' = (ya-xb-yc+xd)co3 + (xa+yb-xc-yd)(si3) */
966             pSrc[(2U * i3) + 1U] = Yd12_out;
967 
968             i0 += n1;
969          } while (i0 < fftLen);
970          j++;
971       } while (j <= (n2 - 1U));
972       twidCoefModifier <<= 2U;
973    }
974    /*  Initializations of last stage */
975 
976    j = fftLen >> 2;
977    ptr1 = &pSrc[0];
978 
979    /*  Calculations of last stage */
980    do
981    {
982       xaIn = ptr1[0];
983       yaIn = ptr1[1];
984       xbIn = ptr1[2];
985       ybIn = ptr1[3];
986       xcIn = ptr1[4];
987       ycIn = ptr1[5];
988       xdIn = ptr1[6];
989       ydIn = ptr1[7];
990 
991       /*  Butterfly implementation */
992       /* xa + xc */
993       Xaplusc = (_Float16)xaIn + (_Float16)xcIn;
994 
995       /* xa - xc */
996       Xaminusc = (_Float16)xaIn - (_Float16)xcIn;
997 
998       /* ya + yc */
999       Yaplusc = (_Float16)yaIn + (_Float16)ycIn;
1000 
1001       /* ya - yc */
1002       Yaminusc = (_Float16)yaIn - (_Float16)ycIn;
1003 
1004       /* xb + xd */
1005       Xbplusd = (_Float16)xbIn + (_Float16)xdIn;
1006 
1007       /* yb + yd */
1008       Ybplusd = (_Float16)ybIn + (_Float16)ydIn;
1009 
1010       /* (xb-xd) */
1011       Xbminusd = (_Float16)xbIn - (_Float16)xdIn;
1012 
1013       /* (yb-yd) */
1014       Ybminusd = (_Float16)ybIn - (_Float16)ydIn;
1015 
1016       /* xa' = (xa+xb+xc+xd) * onebyfftLen */
1017       a0 = ((_Float16)Xaplusc + (_Float16)Xbplusd);
1018       /* ya' = (ya+yb+yc+yd) * onebyfftLen */
1019       a1 = ((_Float16)Yaplusc + (_Float16)Ybplusd);
1020       /* xc' = (xa-xb+xc-xd) * onebyfftLen */
1021       a2 = ((_Float16)Xaplusc - (_Float16)Xbplusd);
1022       /* yc' = (ya-yb+yc-yd) * onebyfftLen  */
1023       a3 = ((_Float16)Yaplusc - (_Float16)Ybplusd);
1024       /* xb' = (xa-yb-xc+yd) * onebyfftLen */
1025       a4 = ((_Float16)Xaminusc - (_Float16)Ybminusd);
1026       /* yb' = (ya+xb-yc-xd) * onebyfftLen */
1027       a5 = ((_Float16)Yaminusc + (_Float16)Xbminusd);
      /* xd' = (xa+yb-xc-yd) * onebyfftLen */
1029       a6 = ((_Float16)Xaminusc + (_Float16)Ybminusd);
1030       /* yd' = (ya-xb-yc+xd) * onebyfftLen */
1031       a7 = ((_Float16)Yaminusc - (_Float16)Xbminusd);
1032 
1033       p0 = (_Float16)a0 * (_Float16)onebyfftLen;
1034       p1 = (_Float16)a1 * (_Float16)onebyfftLen;
1035       p2 = (_Float16)a2 * (_Float16)onebyfftLen;
1036       p3 = (_Float16)a3 * (_Float16)onebyfftLen;
1037       p4 = (_Float16)a4 * (_Float16)onebyfftLen;
1038       p5 = (_Float16)a5 * (_Float16)onebyfftLen;
1039       p6 = (_Float16)a6 * (_Float16)onebyfftLen;
1040       p7 = (_Float16)a7 * (_Float16)onebyfftLen;
1041 
1042       /* xa' = (xa+xb+xc+xd) * onebyfftLen */
1043       ptr1[0] = p0;
1044       /* ya' = (ya+yb+yc+yd) * onebyfftLen */
1045       ptr1[1] = p1;
1046       /* xc' = (xa-xb+xc-xd) * onebyfftLen */
1047       ptr1[2] = p2;
1048       /* yc' = (ya-yb+yc-yd) * onebyfftLen  */
1049       ptr1[3] = p3;
1050       /* xb' = (xa-yb-xc+yd) * onebyfftLen */
1051       ptr1[4] = p4;
1052       /* yb' = (ya+xb-yc-xd) * onebyfftLen */
1053       ptr1[5] = p5;
      /* xd' = (xa+yb-xc-yd) * onebyfftLen */
1055       ptr1[6] = p6;
1056       /* yd' = (ya-xb-yc+xd) * onebyfftLen */
1057       ptr1[7] = p7;
1058 
1059       /* increment source pointer by 8 for next calculations */
1060       ptr1 = ptr1 + 8U;
1061 
1062    } while (--j);
1063 
1064 #else
1065 
1066    float16_t t1, t2, r1, r2, s1, s2;
1067 
1068    /* Run the below code for Cortex-M0 */
1069 
1070    /*  Initializations for the first stage */
1071    n2 = fftLen;
1072    n1 = n2;
1073 
1074    /*  Calculation of first stage */
1075    for (k = fftLen; k > 4U; k >>= 2U)
1076    {
1077       /*  Initializations for the first stage */
1078       n1 = n2;
1079       n2 >>= 2U;
1080       ia1 = 0U;
1081 
1082       /*  Calculation of first stage */
1083       j = 0;
1084       do
1085       {
1086          /*  index calculation for the coefficients */
1087          ia2 = ia1 + ia1;
1088          ia3 = ia2 + ia1;
1089          co1 = pCoef[ia1 * 2U];
1090          si1 = pCoef[(ia1 * 2U) + 1U];
1091          co2 = pCoef[ia2 * 2U];
1092          si2 = pCoef[(ia2 * 2U) + 1U];
1093          co3 = pCoef[ia3 * 2U];
1094          si3 = pCoef[(ia3 * 2U) + 1U];
1095 
1096          /*  Twiddle coefficients index modifier */
1097          ia1 = ia1 + twidCoefModifier;
1098 
1099          i0 = j;
1100          do
1101          {
1102             /*  index calculation for the input as, */
1103             /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
1104             i1 = i0 + n2;
1105             i2 = i1 + n2;
1106             i3 = i2 + n2;
1107 
1108             /* xa + xc */
1109             r1 = (_Float16)pSrc[(2U * i0)] + (_Float16)pSrc[(2U * i2)];
1110 
1111             /* xa - xc */
1112             r2 = (_Float16)pSrc[(2U * i0)] - (_Float16)pSrc[(2U * i2)];
1113 
1114             /* ya + yc */
1115             s1 = (_Float16)pSrc[(2U * i0) + 1U] + (_Float16)pSrc[(2U * i2) + 1U];
1116 
1117             /* ya - yc */
1118             s2 = (_Float16)pSrc[(2U * i0) + 1U] - (_Float16)pSrc[(2U * i2) + 1U];
1119 
1120             /* xb + xd */
1121             t1 = (_Float16)pSrc[2U * i1] + (_Float16)pSrc[2U * i3];
1122 
1123             /* xa' = xa + xb + xc + xd */
1124             pSrc[2U * i0] = (_Float16)r1 + (_Float16)t1;
1125 
1126             /* xa + xc -(xb + xd) */
1127             r1 = (_Float16)r1 - (_Float16)t1;
1128 
1129             /* yb + yd */
1130             t2 = (_Float16)pSrc[(2U * i1) + 1U] + (_Float16)pSrc[(2U * i3) + 1U];
1131 
1132             /* ya' = ya + yb + yc + yd */
1133             pSrc[(2U * i0) + 1U] = (_Float16)s1 + (_Float16)t2;
1134 
1135             /* (ya + yc) - (yb + yd) */
1136             s1 = (_Float16)s1 - (_Float16)t2;
1137 
1138             /* (yb - yd) */
1139             t1 = (_Float16)pSrc[(2U * i1) + 1U] - (_Float16)pSrc[(2U * i3) + 1U];
1140 
1141             /* (xb - xd) */
1142             t2 = (_Float16)pSrc[2U * i1] - (_Float16)pSrc[2U * i3];
1143 
1144             /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
1145             pSrc[2U * i1] = ((_Float16)r1 * (_Float16)co2) - ((_Float16)s1 * (_Float16)si2);
1146 
1147             /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
1148             pSrc[(2U * i1) + 1U] = ((_Float16)s1 * (_Float16)co2) + ((_Float16)r1 * (_Float16)si2);
1149 
1150             /* (xa - xc) - (yb - yd) */
1151             r1 = (_Float16)r2 - (_Float16)t1;
1152 
1153             /* (xa - xc) + (yb - yd) */
1154             r2 = (_Float16)r2 + (_Float16)t1;
1155 
1156             /* (ya - yc) +  (xb - xd) */
1157             s1 = (_Float16)s2 + (_Float16)t2;
1158 
1159             /* (ya - yc) -  (xb - xd) */
1160             s2 = (_Float16)s2 - (_Float16)t2;
1161 
            /* xb' = (xa-yb-xc+yd)co1 - (ya+xb-yc-xd)(si1) */
1163             pSrc[2U * i2] = ((_Float16)r1 * (_Float16)co1) - ((_Float16)s1 * (_Float16)si1);
1164 
            /* yb' = (ya+xb-yc-xd)co1 + (xa-yb-xc+yd)(si1) */
1166             pSrc[(2U * i2) + 1U] = ((_Float16)s1 * (_Float16)co1) + ((_Float16)r1 * (_Float16)si1);
1167 
            /* xd' = (xa+yb-xc-yd)co3 - (ya-xb-yc+xd)(si3) */
1169             pSrc[2U * i3] = ((_Float16)r2 * (_Float16)co3) - ((_Float16)s2 * (_Float16)si3);
1170 
            /* yd' = (ya-xb-yc+xd)co3 + (xa+yb-xc-yd)(si3) */
1172             pSrc[(2U * i3) + 1U] = ((_Float16)s2 * (_Float16)co3) + ((_Float16)r2 * (_Float16)si3);
1173 
1174             i0 += n1;
1175          } while ( i0 < fftLen);
1176          j++;
1177       } while (j <= (n2 - 1U));
1178       twidCoefModifier <<= 2U;
1179    }
1180    /*  Initializations of last stage */
1181    n1 = n2;
1182    n2 >>= 2U;
1183 
1184    /*  Calculations of last stage */
1185    for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
1186    {
1187       /*  index calculation for the input as, */
1188       /*  pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
1189       i1 = i0 + n2;
1190       i2 = i1 + n2;
1191       i3 = i2 + n2;
1192 
1193       /*  Butterfly implementation */
1194       /* xa + xc */
1195       r1 = (_Float16)pSrc[2U * i0] + (_Float16)pSrc[2U * i2];
1196 
1197       /* xa - xc */
1198       r2 = (_Float16)pSrc[2U * i0] - (_Float16)pSrc[2U * i2];
1199 
1200       /* ya + yc */
1201       s1 = (_Float16)pSrc[(2U * i0) + 1U] + (_Float16)pSrc[(2U * i2) + 1U];
1202 
1203       /* ya - yc */
1204       s2 = (_Float16)pSrc[(2U * i0) + 1U] - (_Float16)pSrc[(2U * i2) + 1U];
1205 
      /* xb + xd */
1207       t1 = (_Float16)pSrc[2U * i1] + (_Float16)pSrc[2U * i3];
1208 
1209       /* xa' = xa + xb + xc + xd */
1210       pSrc[2U * i0] = ((_Float16)r1 + (_Float16)t1) * (_Float16)onebyfftLen;
1211 
      /* (xa + xc) - (xb + xd) */
1213       r1 = (_Float16)r1 - (_Float16)t1;
1214 
1215       /* yb + yd */
1216       t2 = (_Float16)pSrc[(2U * i1) + 1U] + (_Float16)pSrc[(2U * i3) + 1U];
1217 
1218       /* ya' = ya + yb + yc + yd */
1219       pSrc[(2U * i0) + 1U] = ((_Float16)s1 + (_Float16)t2) * (_Float16)onebyfftLen;
1220 
1221       /* (ya + yc) - (yb + yd) */
1222       s1 = (_Float16)s1 - (_Float16)t2;
1223 
1224       /* (yb-yd) */
1225       t1 = (_Float16)pSrc[(2U * i1) + 1U] - (_Float16)pSrc[(2U * i3) + 1U];
1226 
1227       /* (xb-xd) */
1228       t2 = (_Float16)pSrc[2U * i1] - (_Float16)pSrc[2U * i3];
1229 
      /* xc' = (xa-xb+xc-xd) * onebyfftLen */
1231       pSrc[2U * i1] = (_Float16)r1 * (_Float16)onebyfftLen;
1232 
      /* yc' = (ya-yb+yc-yd) * onebyfftLen */
1234       pSrc[(2U * i1) + 1U] = (_Float16)s1 * (_Float16)onebyfftLen;
1235 
1236       /* (xa - xc) - (yb-yd) */
1237       r1 = (_Float16)r2 - (_Float16)t1;
1238 
1239       /* (xa - xc) + (yb-yd) */
1240       r2 = (_Float16)r2 + (_Float16)t1;
1241 
1242       /* (ya - yc) + (xb-xd) */
1243       s1 = (_Float16)s2 + (_Float16)t2;
1244 
1245       /* (ya - yc) - (xb-xd) */
1246       s2 = (_Float16)s2 - (_Float16)t2;
1247 
      /* xb' = (xa-yb-xc+yd) * onebyfftLen */
1249       pSrc[2U * i2] = (_Float16)r1 * (_Float16)onebyfftLen;
1250 
      /* yb' = (ya+xb-yc-xd) * onebyfftLen */
1252       pSrc[(2U * i2) + 1U] = (_Float16)s1 * (_Float16)onebyfftLen;
1253 
      /* xd' = (xa+yb-xc-yd) * onebyfftLen */
1255       pSrc[2U * i3] = (_Float16)r2 * (_Float16)onebyfftLen;
1256 
      /* yd' = (ya-xb-yc+xd) * onebyfftLen */
1258       pSrc[(2U * i3) + 1U] = (_Float16)s2 * (_Float16)onebyfftLen;
1259    }
1260 
1261 #endif /* #if defined (ARM_MATH_DSP) */
1262 }
1263 
1264 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
1265