1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_rfft_q31.c
4  * Description:  FFT & RIFFT Q31 process function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/transform_functions.h"
30 
31 /* ----------------------------------------------------------------------
32  * Internal functions prototypes
33  * -------------------------------------------------------------------- */
34 
35 ARM_DSP_ATTRIBUTE void arm_split_rfft_q31(
36         q31_t * pSrc,
37         uint32_t fftLen,
38   const q31_t * pATable,
39   const q31_t * pBTable,
40         q31_t * pDst,
41         uint32_t modifier);
42 
43 ARM_DSP_ATTRIBUTE void arm_split_rifft_q31(
44         q31_t * pSrc,
45         uint32_t fftLen,
46   const q31_t * pATable,
47   const q31_t * pBTable,
48         q31_t * pDst,
49         uint32_t modifier);
50 
51 /**
52   @addtogroup RealFFTQ31
53   @{
54  */
55 
56 /**
57   @brief         Processing function for the Q31 RFFT/RIFFT.
58   @param[in]     S     points to an instance of the Q31 RFFT/RIFFT structure
59   @param[in]     pSrc  points to input buffer (Source buffer is modified by this function)
60   @param[out]    pDst  points to output buffer
61 
62   @par           Input an output formats
63                    Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
64                    Hence the output format is different for different RFFT sizes.
65                    The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT:
66   @par             Input and Output formats for RFFT Q31
67 
68 | RFFT Size  | Input Format  | Output Format  | Number of bits to upscale |
69 | ---------: | ------------: | -------------: | ------------------------: |
70 | 32         | 1.31          | 6.26           | 5                         |
71 | 64         | 1.31          | 7.25           | 6                         |
72 | 128        | 1.31          | 8.24           | 7                         |
73 | 256        | 1.31          | 9.23           | 8                         |
74 | 512        | 1.31          | 10.22          | 9                         |
75 | 1024       | 1.31          | 11.21          | 10                        |
76 | 2048       | 1.31          | 12.20          | 11                        |
77 | 4096       | 1.31          | 13.19          | 12                        |
78 | 8192       | 1.31          | 14.18          | 13                        |
79 
80   @par             Input and Output formats for RIFFT Q31
81 
82 | RIFFT Size  | Input Format  | Output Format  | Number of bits to upscale |
83 | ----------: | ------------: | -------------: | ------------------------: |
84 | 32          | 1.31          | 6.26           | 0                         |
85 | 64          | 1.31          | 7.25           | 0                         |
86 | 128         | 1.31          | 8.24           | 0                         |
87 | 256         | 1.31          | 9.23           | 0                         |
88 | 512         | 1.31          | 10.22          | 0                         |
89 | 1024        | 1.31          | 11.21          | 0                         |
90 | 2048        | 1.31          | 12.20          | 0                         |
91 | 4096        | 1.31          | 13.19          | 0                         |
92 | 8192        | 1.31          | 14.18          | 0                         |
93 
94   @par
95                    If the input buffer is of length N (fftLenReal), the output buffer must have length 2N
96                    since it is containing the conjugate part (except for MVE version where N+2 is enough).
97                    The input buffer is modified by this function.
98   @par
99                    For the RIFFT, the source buffer must have length N+2 since the Nyquist frequency value
100                    is needed but conjugate part is ignored.
101                    It is not using the packing trick of the float version.
102 
103  */
104 
arm_rfft_q31(const arm_rfft_instance_q31 * S,q31_t * pSrc,q31_t * pDst)105 ARM_DSP_ATTRIBUTE void arm_rfft_q31(
106   const arm_rfft_instance_q31 * S,
107         q31_t * pSrc,
108         q31_t * pDst)
109 {
110 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
111   const arm_cfft_instance_q31 *S_CFFT = &(S->cfftInst);
112 #else
113   const arm_cfft_instance_q31 *S_CFFT = S->pCfft;
114 #endif
115         uint32_t L2 = S->fftLenReal >> 1U;
116 
117   /* Calculation of RIFFT of input */
118   if (S->ifftFlagR == 1U)
119   {
120      /*  Real IFFT core process */
121      arm_split_rifft_q31 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
122 
123      /* Complex IFFT process */
124      arm_cfft_q31 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
125 
126      arm_shift_q31(pDst, 1, pDst, S->fftLenReal);
127   }
128   else
129   {
130      /* Calculation of RFFT of input */
131 
132      /* Complex FFT process */
133      arm_cfft_q31 (S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR);
134 
135      /*  Real FFT core process */
136      arm_split_rfft_q31 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
137   }
138 
139 }
140 
141 /**
142   @} end of RealFFTQ31 group
143  */
144 
145 /**
146   @brief         Core Real FFT process
147   @param[in]     pSrc      points to input buffer
148   @param[in]     fftLen    length of FFT
149   @param[in]     pATable   points to twiddle Coef A buffer
150   @param[in]     pBTable   points to twiddle Coef B buffer
151   @param[out]    pDst      points to output buffer
152   @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
153  */
154 
155 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
156 
157 #include "arm_helium_utils.h"
158 #include "arm_vec_fft.h"
159 
160 #if defined(ARM_DSP_BUILT_WITH_GCC)
161 
162 #define MVE_CMPLX_MULT_FX_AxB_S32(A,B)          vqdmladhxq_s32(vqdmlsdhq_s32((__typeof(A))vuninitializedq_s32(), A, B), A, B)
163 #define MVE_CMPLX_MULT_FX_AxConjB_S32(A,B)      vqdmladhq_s32(vqdmlsdhxq_s32((__typeof(A))vuninitializedq_s32(), A, B), A, B)
164 
165 #endif
166 
arm_split_rfft_q31(q31_t * pSrc,uint32_t fftLen,const q31_t * pATable,const q31_t * pBTable,q31_t * pDst,uint32_t modifier)167 ARM_DSP_ATTRIBUTE void arm_split_rfft_q31(
168     q31_t       *pSrc,
169     uint32_t     fftLen,
170     const q31_t       *pATable,
171     const q31_t       *pBTable,
172     q31_t       *pDst,
173     uint32_t     modifier)
174 {
175     uint32_t        i;          /* Loop Counter */
176     const q31_t    *pCoefA, *pCoefB;    /* Temporary pointers for twiddle factors */
177     q31_t          *pOut1 = &pDst[2];
178     q31_t          *pIn1 = &pSrc[2];
179     uint32x4_t      offset = { 2, 3, 0, 1 };
180     uint32x4_t      offsetCoef = { 0, 1, modifier * 2, modifier * 2 + 1 };
181 
182     offset = offset + (2 * fftLen - 4);
183 
184 
185     /* Init coefficient pointers */
186     pCoefA = &pATable[modifier * 2];
187     pCoefB = &pBTable[modifier * 2];
188 
189     const q31_t    *pCoefAb, *pCoefBb;
190     pCoefAb = pCoefA;
191     pCoefBb = pCoefB;
192 
193     pIn1 = &pSrc[2];
194 
195     i = fftLen - 1U;
196     i = i / 2 + 1;
197     while (i > 0U) {
198         q31x4_t         in1 = vld1q_s32(pIn1);
199         q31x4_t         in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset);
200         q31x4_t         coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef);
201         q31x4_t         coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef);
202 #if defined(ARM_DSP_BUILT_WITH_GCC)
203         q31x4_t         out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxB_S32(in1, coefA),MVE_CMPLX_MULT_FX_AxConjB_S32(coefB, in2));
204 #else
205         q31x4_t         out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxB(in1, coefA, q31x4_t),
206                                          MVE_CMPLX_MULT_FX_AxConjB(coefB, in2, q31x4_t));
207 #endif
208         vst1q(pOut1, out);
209         pOut1 += 4;
210 
211         offsetCoef += modifier * 4;
212         offset -= 4;
213 
214         pIn1 += 4;
215         i -= 1;
216     }
217 
218     pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
219     pDst[2 * fftLen + 1] = 0;
220 
221     pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
222     pDst[1] = 0;
223 }
224 #else
arm_split_rfft_q31(q31_t * pSrc,uint32_t fftLen,const q31_t * pATable,const q31_t * pBTable,q31_t * pDst,uint32_t modifier)225 ARM_DSP_ATTRIBUTE void arm_split_rfft_q31(
226         q31_t * pSrc,
227         uint32_t fftLen,
228   const q31_t * pATable,
229   const q31_t * pBTable,
230         q31_t * pDst,
231         uint32_t modifier)
232 {
233         uint32_t i;                                    /* Loop Counter */
234         q31_t outR, outI;                              /* Temporary variables for output */
235   const q31_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
236         q31_t CoefA1, CoefA2, CoefB1;                  /* Temporary variables for twiddle coefficients */
237         q31_t *pOut1 = &pDst[2], *pOut2 = &pDst[4 * fftLen - 1];
238         q31_t *pIn1 =  &pSrc[2], *pIn2 =  &pSrc[2 * fftLen - 1];
239 
240   /* Init coefficient pointers */
241   pCoefA = &pATable[modifier * 2];
242   pCoefB = &pBTable[modifier * 2];
243 
244   i = fftLen - 1U;
245 
246   while (i > 0U)
247   {
248      /*
249        outR = (  pSrc[2 * i]             * pATable[2 * i]
250                - pSrc[2 * i + 1]         * pATable[2 * i + 1]
251                + pSrc[2 * n - 2 * i]     * pBTable[2 * i]
252                + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
253 
254        outI = (  pIn[2 * i + 1]         * pATable[2 * i]
255                + pIn[2 * i]             * pATable[2 * i + 1]
256                + pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
257                - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
258       */
259 
260      CoefA1 = *pCoefA++;
261      CoefA2 = *pCoefA;
262 
263      /* outR = (pSrc[2 * i] * pATable[2 * i] */
264      mult_32x32_keep32_R (outR, *pIn1, CoefA1);
265 
266      /* outI = pIn[2 * i] * pATable[2 * i + 1] */
267      mult_32x32_keep32_R (outI, *pIn1++, CoefA2);
268 
269      /* - pSrc[2 * i + 1] * pATable[2 * i + 1] */
270      multSub_32x32_keep32_R (outR, *pIn1, CoefA2);
271 
272      /* (pIn[2 * i + 1] * pATable[2 * i] */
273      multAcc_32x32_keep32_R (outI, *pIn1++, CoefA1);
274 
275      /* pSrc[2 * n - 2 * i] * pBTable[2 * i]  */
276      multSub_32x32_keep32_R (outR, *pIn2, CoefA2);
277      CoefB1 = *pCoefB;
278 
279      /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] */
280      multSub_32x32_keep32_R (outI, *pIn2--, CoefB1);
281 
282      /* pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */
283      multAcc_32x32_keep32_R (outR, *pIn2, CoefB1);
284 
285      /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
286      multSub_32x32_keep32_R (outI, *pIn2--, CoefA2);
287 
288      /* write output */
289      *pOut1++ = outR;
290      *pOut1++ = outI;
291 
292      /* write complex conjugate output */
293      *pOut2-- = -outI;
294      *pOut2-- = outR;
295 
296      /* update coefficient pointer */
297      pCoefB = pCoefB + (2 * modifier);
298      pCoefA = pCoefA + (2 * modifier - 1);
299 
300      /* Decrement loop count */
301      i--;
302   }
303 
304   pDst[2 * fftLen]     = (pSrc[0] - pSrc[1]) >> 1U;
305   pDst[2 * fftLen + 1] = 0;
306 
307   pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
308   pDst[1] = 0;
309 }
310 #endif /* defined(ARM_MATH_MVEI) */
311 
312 /**
313   @brief         Core Real IFFT process
314   @param[in]     pSrc      points to input buffer
315   @param[in]     fftLen    length of FFT
316   @param[in]     pATable   points to twiddle Coef A buffer
317   @param[in]     pBTable   points to twiddle Coef B buffer
318   @param[out]    pDst      points to output buffer
319   @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
320  */
321 
322 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
323 
arm_split_rifft_q31(q31_t * pSrc,uint32_t fftLen,const q31_t * pATable,const q31_t * pBTable,q31_t * pDst,uint32_t modifier)324 ARM_DSP_ATTRIBUTE void arm_split_rifft_q31(
325         q31_t * pSrc,
326         uint32_t fftLen,
327   const q31_t * pATable,
328   const q31_t * pBTable,
329         q31_t * pDst,
330         uint32_t modifier)
331 {
332     uint32_t        i;          /* Loop Counter */
333     const q31_t    *pCoefA, *pCoefB;    /* Temporary pointers for twiddle factors */
334     q31_t          *pIn1;
335     uint32x4_t      offset = { 2, 3, 0, 1 };
336     uint32x4_t      offsetCoef = { 0, 1, modifier * 2, modifier * 2 + 1 };
337     int32x4_t       conj = { 1, -1, 1, -1 };
338 
339     offset = offset + (2 * fftLen - 2);
340 
341     /* Init coefficient pointers */
342     pCoefA = &pATable[0];
343     pCoefB = &pBTable[0];
344 
345     const q31_t    *pCoefAb, *pCoefBb;
346     pCoefAb = pCoefA;
347     pCoefBb = pCoefB;
348 
349     pIn1 = &pSrc[0];
350 
351     i = fftLen;
352     i = i >> 1;
353     while (i > 0U) {
354         q31x4_t         in1 = vld1q_s32(pIn1);
355         q31x4_t         in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset);
356         q31x4_t         coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef);
357         q31x4_t         coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef);
358 
359         /* can we avoid the conjugate here ? */
360 #if defined(ARM_DSP_BUILT_WITH_GCC)
361         q31x4_t         out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxConjB_S32(in1, coefA),
362                                      vmulq_s32(conj, MVE_CMPLX_MULT_FX_AxB_S32(in2, coefB)));
363 #else
364         q31x4_t         out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA, q31x4_t),
365                                          vmulq_s32(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB, q31x4_t)));
366 #endif
367         vst1q_s32(pDst, out);
368         pDst += 4;
369 
370         offsetCoef += modifier * 4;
371         offset -= 4;
372 
373         pIn1 += 4;
374         i -= 1;
375     }
376 }
377 #else
arm_split_rifft_q31(q31_t * pSrc,uint32_t fftLen,const q31_t * pATable,const q31_t * pBTable,q31_t * pDst,uint32_t modifier)378 ARM_DSP_ATTRIBUTE void arm_split_rifft_q31(
379         q31_t * pSrc,
380         uint32_t fftLen,
381   const q31_t * pATable,
382   const q31_t * pBTable,
383         q31_t * pDst,
384         uint32_t modifier)
385 {
386         q31_t outR, outI;                              /* Temporary variables for output */
387   const q31_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
388         q31_t CoefA1, CoefA2, CoefB1;                  /* Temporary variables for twiddle coefficients */
389         q31_t *pIn1 = &pSrc[0], *pIn2 = &pSrc[2 * fftLen + 1];
390 
391   pCoefA = &pATable[0];
392   pCoefB = &pBTable[0];
393 
394   while (fftLen > 0U)
395   {
396      /*
397        outR = (  pIn[2 * i]             * pATable[2 * i]
398                + pIn[2 * i + 1]         * pATable[2 * i + 1]
399                + pIn[2 * n - 2 * i]     * pBTable[2 * i]
400                - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
401 
402        outI = (  pIn[2 * i + 1]         * pATable[2 * i]
403                - pIn[2 * i]             * pATable[2 * i + 1]
404                - pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
405                - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
406       */
407 
408      CoefA1 = *pCoefA++;
409      CoefA2 = *pCoefA;
410 
411      /* outR = (pIn[2 * i] * pATable[2 * i] */
412      mult_32x32_keep32_R (outR, *pIn1, CoefA1);
413 
414      /* - pIn[2 * i] * pATable[2 * i + 1] */
415      mult_32x32_keep32_R (outI, *pIn1++, -CoefA2);
416 
417      /* pIn[2 * i + 1] * pATable[2 * i + 1] */
418      multAcc_32x32_keep32_R (outR, *pIn1, CoefA2);
419 
420      /* pIn[2 * i + 1] * pATable[2 * i] */
421      multAcc_32x32_keep32_R (outI, *pIn1++, CoefA1);
422 
423      /* pIn[2 * n - 2 * i] * pBTable[2 * i] */
424      multAcc_32x32_keep32_R (outR, *pIn2, CoefA2);
425      CoefB1 = *pCoefB;
426 
427      /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] */
428      multSub_32x32_keep32_R (outI, *pIn2--, CoefB1);
429 
430      /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */
431      multAcc_32x32_keep32_R (outR, *pIn2, CoefB1);
432 
433      /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
434      multAcc_32x32_keep32_R (outI, *pIn2--, CoefA2);
435 
436      /* write output */
437      *pDst++ = outR;
438      *pDst++ = outI;
439 
440      /* update coefficient pointer */
441      pCoefB = pCoefB + (modifier * 2);
442      pCoefA = pCoefA + (modifier * 2 - 1);
443 
444      /* Decrement loop count */
445      fftLen--;
446   }
447 
448 }
449 
450 #endif /* defined(ARM_MATH_MVEI) */
451