1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_rfft_fast_f16.c
4  * Description:  RFFT & RIFFT Floating point process function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/transform_functions_f16.h"
30 #include "arm_common_tables_f16.h"
31 
32 #if defined(ARM_FLOAT16_SUPPORTED)
33 
34 
35 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
36 
stage_rfft_f16(const arm_rfft_fast_instance_f16 * S,const float16_t * p,float16_t * pOut)37 static void stage_rfft_f16(
38   const arm_rfft_fast_instance_f16 * S,
39   const float16_t * p,
40         float16_t * pOut)
41 {
42         int32_t  k;                                /* Loop Counter */
43         float16_t twR, twI;                         /* RFFT Twiddle coefficients */
44   const float16_t * pCoeff = S->pTwiddleRFFT;       /* Points to RFFT Twiddle factors */
45   const float16_t *pA = p;                          /* increasing pointer */
46   const float16_t *pB = p;                          /* decreasing pointer */
47         float16_t xAR, xAI, xBR, xBI;               /* temporary variables */
48         float16_t t1a, t1b;                         /* temporary variables */
49         float16_t p0, p1, p2, p3;                   /* temporary variables */
50 
51         float16x8x2_t tw,xA,xB;
52         float16x8x2_t tmp1, tmp2, res;
53 
54         uint16x8_t     vecStridesBkwd;
55 
56         vecStridesBkwd = vddupq_u16((uint16_t)14, 2);
57 
58 
59         int blockCnt;
60 
61 
62    k = (S->Sint).fftLen - 1;
63 
64    /* Pack first and last sample of the frequency domain together */
65 
66    xBR = pB[0];
67    xBI = pB[1];
68    xAR = pA[0];
69    xAI = pA[1];
70 
71    twR = *pCoeff++ ;
72    twI = *pCoeff++ ;
73 
74    // U1 = XA(1) + XB(1); % It is real
75    t1a = (_Float16)xBR + (_Float16)xAR  ;
76 
77    // U2 = XB(1) - XA(1); % It is imaginary
78    t1b = (_Float16)xBI + (_Float16)xAI  ;
79 
80    // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
81    // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
82    *pOut++ = 0.5f16 * ( (_Float16)t1a + (_Float16)t1b );
83    *pOut++ = 0.5f16 * ( (_Float16)t1a - (_Float16)t1b );
84 
85    // XA(1) = 1/2*( U1 - imag(U2) +  i*( U1 +imag(U2) ));
86    pB  = p + 2*k - 14;
87    pA += 2;
88 
89    blockCnt = k >> 3;
90    while (blockCnt > 0)
91    {
92       /*
93          function X = my_split_rfft(X, ifftFlag)
94          % X is a series of real numbers
95          L  = length(X);
96          XC = X(1:2:end) +i*X(2:2:end);
97          XA = fft(XC);
98          XB = conj(XA([1 end:-1:2]));
99          TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
100          for l = 2:L/2
101             XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
102          end
103          XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
104          X = XA;
105       */
106 
107 
108       xA = vld2q_f16(pA);
109       pA += 16;
110 
111       xB = vld2q_f16(pB);
112 
113       xB.val[0] = vldrhq_gather_shifted_offset_f16(pB, vecStridesBkwd);
114       xB.val[1] = vldrhq_gather_shifted_offset_f16(&pB[1], vecStridesBkwd);
115 
116       xB.val[1] = vnegq_f16(xB.val[1]);
117       pB -= 16;
118 
119 
120       tw = vld2q_f16(pCoeff);
121       pCoeff += 16;
122 
123 
124       tmp1.val[0] = vaddq_f16(xA.val[0],xB.val[0]);
125       tmp1.val[1] = vaddq_f16(xA.val[1],xB.val[1]);
126 
127       tmp2.val[0] = vsubq_f16(xB.val[0],xA.val[0]);
128       tmp2.val[1] = vsubq_f16(xB.val[1],xA.val[1]);
129 
130       res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
131       res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
132 
133       res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
134       res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
135 
136       res.val[0] = vaddq_f16(res.val[0],tmp1.val[0] );
137       res.val[1] = vaddq_f16(res.val[1],tmp1.val[1] );
138 
139       res.val[0] = vmulq_n_f16(res.val[0], 0.5f);
140       res.val[1] = vmulq_n_f16(res.val[1], 0.5f);
141 
142 
143       vst2q_f16(pOut, res);
144       pOut += 16;
145 
146 
147       blockCnt--;
148    }
149 
150    pB += 14;
151    blockCnt = k & 7;
152    while (blockCnt > 0)
153    {
154       /*
155          function X = my_split_rfft(X, ifftFlag)
156          % X is a series of real numbers
157          L  = length(X);
158          XC = X(1:2:end) +i*X(2:2:end);
159          XA = fft(XC);
160          XB = conj(XA([1 end:-1:2]));
161          TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
162          for l = 2:L/2
163             XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
164          end
165          XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
166          X = XA;
167       */
168 
169       xBI = pB[1];
170       xBR = pB[0];
171       xAR = pA[0];
172       xAI = pA[1];
173 
174       twR = *pCoeff++;
175       twI = *pCoeff++;
176 
177       t1a = (_Float16)xBR - (_Float16)xAR ;
178       t1b = (_Float16)xBI + (_Float16)xAI ;
179 
180       // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
181       // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
182       p0 = (_Float16)twR * (_Float16)t1a;
183       p1 = (_Float16)twI * (_Float16)t1a;
184       p2 = (_Float16)twR * (_Float16)t1b;
185       p3 = (_Float16)twI * (_Float16)t1b;
186 
187       *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR + (_Float16)p0 + (_Float16)p3 ); //xAR
188       *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)p1 - (_Float16)p2 ); //xAI
189 
190       pA += 2;
191       pB -= 2;
192       blockCnt--;
193    }
194 }
195 
196 /* Prepares data for inverse cfft */
merge_rfft_f16(const arm_rfft_fast_instance_f16 * S,const float16_t * p,float16_t * pOut)197 static void merge_rfft_f16(
198   const arm_rfft_fast_instance_f16 * S,
199   const float16_t * p,
200         float16_t * pOut)
201 {
202         int32_t  k;                                /* Loop Counter */
203         float16_t twR, twI;                         /* RFFT Twiddle coefficients */
204   const float16_t *pCoeff = S->pTwiddleRFFT;        /* Points to RFFT Twiddle factors */
205   const float16_t *pA = p;                          /* increasing pointer */
206   const float16_t *pB = p;                          /* decreasing pointer */
207         float16_t xAR, xAI, xBR, xBI;               /* temporary variables */
208         float16_t t1a, t1b, r, s, t, u;             /* temporary variables */
209 
210         float16x8x2_t tw,xA,xB;
211         float16x8x2_t tmp1, tmp2, res;
212         uint16x8_t     vecStridesBkwd;
213 
214         vecStridesBkwd = vddupq_u16((uint16_t)14, 2);
215 
216         int blockCnt;
217 
218 
219    k = (S->Sint).fftLen - 1;
220 
221    xAR = pA[0];
222    xAI = pA[1];
223 
224    pCoeff += 2 ;
225 
226    *pOut++ = 0.5f16 * ( (_Float16)xAR + (_Float16)xAI );
227    *pOut++ = 0.5f16 * ( (_Float16)xAR - (_Float16)xAI );
228 
229    pB  =  p + 2*k - 14;
230    pA +=  2    ;
231 
232    blockCnt = k >> 3;
233    while (blockCnt > 0)
234    {
235       /* G is half of the frequency complex spectrum */
236       //for k = 2:N
237       //    Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
238       xA = vld2q_f16(pA);
239       pA += 16;
240 
241       xB = vld2q_f16(pB);
242 
243       xB.val[0] = vldrhq_gather_shifted_offset_f16(pB, vecStridesBkwd);
244       xB.val[1] = vldrhq_gather_shifted_offset_f16(&pB[1], vecStridesBkwd);
245 
246       xB.val[1] = vnegq_f16(xB.val[1]);
247       pB -= 16;
248 
249 
250       tw = vld2q_f16(pCoeff);
251       tw.val[1] = vnegq_f16(tw.val[1]);
252       pCoeff += 16;
253 
254 
255       tmp1.val[0] = vaddq_f16(xA.val[0],xB.val[0]);
256       tmp1.val[1] = vaddq_f16(xA.val[1],xB.val[1]);
257 
258       tmp2.val[0] = vsubq_f16(xB.val[0],xA.val[0]);
259       tmp2.val[1] = vsubq_f16(xB.val[1],xA.val[1]);
260 
261       res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
262       res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
263 
264       res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
265       res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
266 
267       res.val[0] = vaddq_f16(res.val[0],tmp1.val[0] );
268       res.val[1] = vaddq_f16(res.val[1],tmp1.val[1] );
269 
270       res.val[0] = vmulq_n_f16(res.val[0], 0.5f);
271       res.val[1] = vmulq_n_f16(res.val[1], 0.5f);
272 
273 
274       vst2q_f16(pOut, res);
275       pOut += 16;
276 
277 
278       blockCnt--;
279    }
280 
281    pB += 14;
282    blockCnt = k & 7;
283    while (blockCnt > 0)
284    {
285       /* G is half of the frequency complex spectrum */
286       //for k = 2:N
287       //    Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
288       xBI =   pB[1]    ;
289       xBR =   pB[0]    ;
290       xAR =  pA[0];
291       xAI =  pA[1];
292 
293       twR = *pCoeff++;
294       twI = *pCoeff++;
295 
296       t1a = (_Float16)xAR - (_Float16)xBR ;
297       t1b = (_Float16)xAI + (_Float16)xBI ;
298 
299       r = (_Float16)twR * (_Float16)t1a;
300       s = (_Float16)twI * (_Float16)t1b;
301       t = (_Float16)twI * (_Float16)t1a;
302       u = (_Float16)twR * (_Float16)t1b;
303 
304       // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
305       // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
306       *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR - (_Float16)r - (_Float16)s ); //xAR
307       *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)t - (_Float16)u ); //xAI
308 
309       pA += 2;
310       pB -= 2;
311       blockCnt--;
312    }
313 
314 }
315 #else
stage_rfft_f16(const arm_rfft_fast_instance_f16 * S,const float16_t * p,float16_t * pOut)316 static void stage_rfft_f16(
317   const arm_rfft_fast_instance_f16 * S,
318   const float16_t * p,
319         float16_t * pOut)
320 {
321         int32_t  k;                                /* Loop Counter */
322         float16_t twR, twI;                         /* RFFT Twiddle coefficients */
323   const float16_t * pCoeff = S->pTwiddleRFFT;       /* Points to RFFT Twiddle factors */
324   const float16_t *pA = p;                          /* increasing pointer */
325   const float16_t *pB = p;                          /* decreasing pointer */
326         float16_t xAR, xAI, xBR, xBI;               /* temporary variables */
327         float16_t t1a, t1b;                         /* temporary variables */
328         float16_t p0, p1, p2, p3;                   /* temporary variables */
329 
330 
331    k = (S->Sint).fftLen - 1;
332 
333    /* Pack first and last sample of the frequency domain together */
334 
335    xBR = pB[0];
336    xBI = pB[1];
337    xAR = pA[0];
338    xAI = pA[1];
339 
340    twR = *pCoeff++ ;
341    twI = *pCoeff++ ;
342 
343 
344    // U1 = XA(1) + XB(1); % It is real
345    t1a = (_Float16)xBR + (_Float16)xAR  ;
346 
347    // U2 = XB(1) - XA(1); % It is imaginary
348    t1b = (_Float16)xBI + (_Float16)xAI  ;
349 
350    // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
351    // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
352    *pOut++ = 0.5f16 * ( (_Float16)t1a + (_Float16)t1b );
353    *pOut++ = 0.5f16 * ( (_Float16)t1a - (_Float16)t1b );
354 
355    // XA(1) = 1/2*( U1 - imag(U2) +  i*( U1 +imag(U2) ));
356    pB  = p + 2*k;
357    pA += 2;
358 
359    do
360    {
361       /*
362          function X = my_split_rfft(X, ifftFlag)
363          % X is a series of real numbers
364          L  = length(X);
365          XC = X(1:2:end) +i*X(2:2:end);
366          XA = fft(XC);
367          XB = conj(XA([1 end:-1:2]));
368          TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
369          for l = 2:L/2
370             XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
371          end
372          XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
373          X = XA;
374       */
375 
376       xBI = pB[1];
377       xBR = pB[0];
378       xAR = pA[0];
379       xAI = pA[1];
380 
381       twR = *pCoeff++;
382       twI = *pCoeff++;
383 
384       t1a = (_Float16)xBR - (_Float16)xAR ;
385       t1b = (_Float16)xBI + (_Float16)xAI ;
386 
387       // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
388       // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
389       p0 = (_Float16)twR * (_Float16)t1a;
390       p1 = (_Float16)twI * (_Float16)t1a;
391       p2 = (_Float16)twR * (_Float16)t1b;
392       p3 = (_Float16)twI * (_Float16)t1b;
393 
394       *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR + (_Float16)p0 + (_Float16)p3 ); //xAR
395       *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)p1 - (_Float16)p2 ); //xAI
396 
397 
398       pA += 2;
399       pB -= 2;
400       k--;
401    } while (k > 0);
402 }
403 
404 /* Prepares data for inverse cfft */
merge_rfft_f16(const arm_rfft_fast_instance_f16 * S,const float16_t * p,float16_t * pOut)405 static void merge_rfft_f16(
406   const arm_rfft_fast_instance_f16 * S,
407   const float16_t * p,
408         float16_t * pOut)
409 {
410         int32_t  k;                                /* Loop Counter */
411         float16_t twR, twI;                         /* RFFT Twiddle coefficients */
412   const float16_t *pCoeff = S->pTwiddleRFFT;        /* Points to RFFT Twiddle factors */
413   const float16_t *pA = p;                          /* increasing pointer */
414   const float16_t *pB = p;                          /* decreasing pointer */
415         float16_t xAR, xAI, xBR, xBI;               /* temporary variables */
416         float16_t t1a, t1b, r, s, t, u;             /* temporary variables */
417 
418    k = (S->Sint).fftLen - 1;
419 
420    xAR = pA[0];
421    xAI = pA[1];
422 
423    pCoeff += 2 ;
424 
425    *pOut++ = 0.5f16 * ( (_Float16)xAR + (_Float16)xAI );
426    *pOut++ = 0.5f16 * ( (_Float16)xAR - (_Float16)xAI );
427 
428    pB  =  p + 2*k ;
429    pA +=  2	   ;
430 
431    while (k > 0)
432    {
433       /* G is half of the frequency complex spectrum */
434       //for k = 2:N
435       //    Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
436       xBI =   pB[1]    ;
437       xBR =   pB[0]    ;
438       xAR =  pA[0];
439       xAI =  pA[1];
440 
441       twR = *pCoeff++;
442       twI = *pCoeff++;
443 
444       t1a = (_Float16)xAR - (_Float16)xBR ;
445       t1b = (_Float16)xAI + (_Float16)xBI ;
446 
447       r = (_Float16)twR * (_Float16)t1a;
448       s = (_Float16)twI * (_Float16)t1b;
449       t = (_Float16)twI * (_Float16)t1a;
450       u = (_Float16)twR * (_Float16)t1b;
451 
452       // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
453       // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
454       *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR - (_Float16)r - (_Float16)s ); //xAR
455       *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)t - (_Float16)u ); //xAI
456 
457       pA += 2;
458       pB -= 2;
459       k--;
460    }
461 
462 }
463 
464 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
465 
466 /**
467   @ingroup RealFFT
468 */
469 
470 /**
471   @defgroup RealFFTF16 Real FFT F16 Functions
472 */
473 
474 /**
475   @addtogroup RealFFTF16
476   @{
477 */
478 
479 /**
480   @brief         Processing function for the floating-point real FFT.
481   @param[in]     S         points to an arm_rfft_fast_instance_f16 structure
482   @param[in]     p         points to input buffer (Source buffer is modified by this function.)
483   @param[in]     pOut      points to output buffer
484   @param[in]     ifftFlag
485                    - value = 0: RFFT
486                    - value = 1: RIFFT
487 */
488 
arm_rfft_fast_f16(const arm_rfft_fast_instance_f16 * S,float16_t * p,float16_t * pOut,uint8_t ifftFlag)489 ARM_DSP_ATTRIBUTE void arm_rfft_fast_f16(
490   const arm_rfft_fast_instance_f16 * S,
491   float16_t * p,
492   float16_t * pOut,
493   uint8_t ifftFlag)
494 {
495    const arm_cfft_instance_f16 * Sint = &(S->Sint);
496 
497 
498    /* Calculation of Real FFT */
499    if (ifftFlag)
500    {
501       /*  Real FFT compression */
502       merge_rfft_f16(S, p, pOut);
503       /* Complex radix-4 IFFT process */
504       arm_cfft_f16( Sint, pOut, ifftFlag, 1);
505    }
506    else
507    {
508 
509       /* Calculation of RFFT of input */
510       arm_cfft_f16( Sint, p, ifftFlag, 1);
511 
512       /*  Real FFT extraction */
513       stage_rfft_f16(S, p, pOut);
514    }
515 }
516 
517 /**
518 * @} end of RealFFTF16 group
519 */
520 
521 #endif /*  #if defined(ARM_FLOAT16_SUPPORTED) */
522