1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_rfft_q15.c
4  * Description:  RFFT & RIFFT Q15 process function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/transform_functions.h"
30 
31 /* ----------------------------------------------------------------------
32  * Internal functions prototypes
33  * -------------------------------------------------------------------- */
34 
35 void arm_split_rfft_q15(
36         q15_t * pSrc,
37         uint32_t fftLen,
38   const q15_t * pATable,
39   const q15_t * pBTable,
40         q15_t * pDst,
41         uint32_t modifier);
42 
43 void arm_split_rifft_q15(
44         q15_t * pSrc,
45         uint32_t fftLen,
46   const q15_t * pATable,
47   const q15_t * pBTable,
48         q15_t * pDst,
49         uint32_t modifier);
50 
51 /**
52   @addtogroup RealFFT
53   @{
54  */
55 
56 /**
57   @brief         Processing function for the Q15 RFFT/RIFFT.
58   @param[in]     S     points to an instance of the Q15 RFFT/RIFFT structure
59   @param[in]     pSrc  points to input buffer (Source buffer is modified by this function.)
60   @param[out]    pDst  points to output buffer
61   @return        none
62 
63   @par           Input an output formats
64                    Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
65                    Hence the output format is different for different RFFT sizes.
66                    The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT:
67   @par
68                    \image html RFFTQ15.gif "Input and Output Formats for Q15 RFFT"
69   @par
70                    \image html RIFFTQ15.gif "Input and Output Formats for Q15 RIFFT"
71   @par
72                    If the input buffer is of length N, the output buffer must have length 2*N.
73                    The input buffer is modified by this function.
74   @par
75                    For the RIFFT, the source buffer must at least have length
76                    fftLenReal + 2.
77                    The last two elements must be equal to what would be generated
78                    by the RFFT:
79                      (pSrc[0] - pSrc[1]) >> 1 and 0
80  */
81 
arm_rfft_q15(const arm_rfft_instance_q15 * S,q15_t * pSrc,q15_t * pDst)82 void arm_rfft_q15(
83   const arm_rfft_instance_q15 * S,
84         q15_t * pSrc,
85         q15_t * pDst)
86 {
87 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
88   const arm_cfft_instance_q15 *S_CFFT = &(S->cfftInst);
89 #else
90   const arm_cfft_instance_q15 *S_CFFT = S->pCfft;
91 #endif
92         uint32_t L2 = S->fftLenReal >> 1U;
93 
94   /* Calculation of RIFFT of input */
95   if (S->ifftFlagR == 1U)
96   {
97      /*  Real IFFT core process */
98      arm_split_rifft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
99 
100      /* Complex IFFT process */
101      arm_cfft_q15 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
102 
103      arm_shift_q15(pDst, 1, pDst, S->fftLenReal);
104   }
105   else
106   {
107      /* Calculation of RFFT of input */
108 
109      /* Complex FFT process */
110      arm_cfft_q15 (S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR);
111 
112      /*  Real FFT core process */
113      arm_split_rfft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
114   }
115 
116 }
117 
118 /**
119   @} end of RealFFT group
120  */
121 
122 /**
123   @brief         Core Real FFT process
124   @param[in]     pSrc      points to input buffer
125   @param[in]     fftLen    length of FFT
126   @param[in]     pATable   points to twiddle Coef A buffer
127   @param[in]     pBTable   points to twiddle Coef B buffer
128   @param[out]    pDst      points to output buffer
129   @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
130   @return        none
131 
132   @par
133                    The function implements a Real FFT
134  */
135 
136 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
137 
138 #include "arm_helium_utils.h"
139 #include "arm_vec_fft.h"
140 
141 #if defined(__CMSIS_GCC_H)
142 #define MVE_CMPLX_MULT_FX_AxB_S16(A,B)          vqdmladhxq_s16(vqdmlsdhq_s16((__typeof(A))vuninitializedq_s16(), A, B), A, B)
143 #define MVE_CMPLX_MULT_FX_AxConjB_S16(A,B)      vqdmladhq_s16(vqdmlsdhxq_s16((__typeof(A))vuninitializedq_s16(), A, B), A, B)
144 
145 #endif
146 
arm_split_rfft_q15(q15_t * pSrc,uint32_t fftLen,const q15_t * pATable,const q15_t * pBTable,q15_t * pDst,uint32_t modifier)147 void arm_split_rfft_q15(
148         q15_t * pSrc,
149         uint32_t fftLen,
150   const q15_t * pATable,
151   const q15_t * pBTable,
152         q15_t * pDst,
153         uint32_t modifier)
154 {
155    uint32_t        i;          /* Loop Counter */
156     const q15_t    *pCoefA, *pCoefB;    /* Temporary pointers for twiddle factors */
157     q15_t          *pOut1 = &pDst[2];
158     q15_t          *pIn1 = &pSrc[2];
159     uint16x8_t      offsetIn = { 6, 7, 4, 5, 2, 3, 0, 1 };
160     uint16x8_t      offsetCoef;
161     const uint16_t  offsetCoefArr[16] = {
162         0, 0, 2, 2, 4, 4, 6, 6,
163         0, 1, 0, 1, 0, 1, 0, 1
164     };
165 
166     offsetCoef = vmulq_n_u16(vld1q_u16(offsetCoefArr), modifier) + vld1q_u16(offsetCoefArr + 8);
167     offsetIn = vaddq_n_u16(offsetIn, (2 * fftLen - 8));
168 
169     /* Init coefficient pointers */
170     pCoefA = &pATable[modifier * 2];
171     pCoefB = &pBTable[modifier * 2];
172 
173     const q15_t    *pCoefAb, *pCoefBb;
174     pCoefAb = pCoefA;
175     pCoefBb = pCoefB;
176 
177     pIn1 = &pSrc[2];
178 
179     i = fftLen - 1U;
180     i = i / 4 + 1;
181     while (i > 0U) {
182         q15x8_t         in1 = vld1q_s16(pIn1);
183         q15x8_t         in2 = vldrhq_gather_shifted_offset_s16(pSrc, offsetIn);
184         q15x8_t         coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef);
185         q15x8_t         coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef);
186 
187 #if defined(__CMSIS_GCC_H)
188         q15x8_t         out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxB_S16(in1, coefA),
189                                      MVE_CMPLX_MULT_FX_AxConjB_S16(coefB, in2));
190 #else
191         q15x8_t         out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxB(in1, coefA),
192                                      MVE_CMPLX_MULT_FX_AxConjB(coefB, in2));
193 #endif
194         vst1q_s16(pOut1, out);
195         pOut1 += 8;
196 
197         offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8);
198         offsetIn -= 8;
199         pIn1 += 8;
200         i -= 1;
201     }
202 
203     pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
204     pDst[2 * fftLen + 1] = 0;
205 
206     pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
207     pDst[1] = 0;
208 }
209 #else
arm_split_rfft_q15(q15_t * pSrc,uint32_t fftLen,const q15_t * pATable,const q15_t * pBTable,q15_t * pDst,uint32_t modifier)210 void arm_split_rfft_q15(
211         q15_t * pSrc,
212         uint32_t fftLen,
213   const q15_t * pATable,
214   const q15_t * pBTable,
215         q15_t * pDst,
216         uint32_t modifier)
217 {
218         uint32_t i;                                    /* Loop Counter */
219         q31_t outR, outI;                              /* Temporary variables for output */
220   const q15_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
221         q15_t *pSrc1, *pSrc2;
222 #if defined (ARM_MATH_DSP)
223         q15_t *pD1, *pD2;
224 #endif
225 
226   /* Init coefficient pointers */
227   pCoefA = &pATable[modifier * 2];
228   pCoefB = &pBTable[modifier * 2];
229 
230   pSrc1 = &pSrc[2];
231   pSrc2 = &pSrc[(2U * fftLen) - 2U];
232 
233 #if defined (ARM_MATH_DSP)
234 
235     i = 1U;
236     pD1 = pDst + 2;
237     pD2 = pDst + (4U * fftLen) - 2;
238 
239     for (i = fftLen - 1; i > 0; i--)
240     {
241         /*
242           outR = (  pSrc[2 * i]             * pATable[2 * i]
243                   - pSrc[2 * i + 1]         * pATable[2 * i + 1]
244                   + pSrc[2 * n - 2 * i]     * pBTable[2 * i]
245                   + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
246 
247           outI = (  pIn[2 * i + 1]         * pATable[2 * i]
248                   + pIn[2 * i]             * pATable[2 * i + 1]
249                   + pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
250                   - pIn[2 * n - 2 * i + 1] * pBTable[2 * i])
251          */
252 
253 
254 #ifndef ARM_MATH_BIG_ENDIAN
255         /* pSrc[2 * i] * pATable[2 * i] - pSrc[2 * i + 1] * pATable[2 * i + 1] */
256         outR = __SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA));
257 #else
258         /* -(pSrc[2 * i + 1] * pATable[2 * i + 1] - pSrc[2 * i] * pATable[2 * i]) */
259         outR = -(__SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA)));
260 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
261 
262         /* pSrc[2 * n - 2 * i] * pBTable[2 * i] + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
263         outR = __SMLAD(read_q15x2 (pSrc2), read_q15x2((q15_t *) pCoefB), outR) >> 16U;
264 
265         /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] - pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
266 #ifndef ARM_MATH_BIG_ENDIAN
267         outI = __SMUSDX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB));
268 #else
269         outI = __SMUSDX(read_q15x2 ((q15_t *) pCoefB), read_q15x2_da (&pSrc2));
270 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
271 
272         /* (pIn[2 * i + 1] * pATable[2 * i] + pIn[2 * i] * pATable[2 * i + 1] */
273         outI = __SMLADX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), outI);
274 
275         /* write output */
276         *pD1++ = (q15_t) outR;
277         *pD1++ = outI >> 16U;
278 
279         /* write complex conjugate output */
280         pD2[0] = (q15_t) outR;
281         pD2[1] = -(outI >> 16U);
282         pD2 -= 2;
283 
284         /* update coefficient pointer */
285         pCoefB = pCoefB + (2U * modifier);
286         pCoefA = pCoefA + (2U * modifier);
287     }
288 
289     pDst[2U * fftLen]      = (pSrc[0] - pSrc[1]) >> 1U;
290     pDst[2U * fftLen + 1U] = 0;
291 
292     pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
293     pDst[1] = 0;
294 
295 #else
296 
297     i = 1U;
298 
299     while (i < fftLen)
300     {
301         /*
302           outR = (  pSrc[2 * i]             * pATable[2 * i]
303                   - pSrc[2 * i + 1]         * pATable[2 * i + 1]
304                   + pSrc[2 * n - 2 * i]     * pBTable[2 * i]
305                   + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
306         */
307 
308         outR = *pSrc1 * *pCoefA;
309         outR = outR - (*(pSrc1 + 1) * *(pCoefA + 1));
310         outR = outR + (*pSrc2 * *pCoefB);
311         outR = (outR + (*(pSrc2 + 1) * *(pCoefB + 1))) >> 16;
312 
313         /*
314           outI = (  pIn[2 * i + 1]         * pATable[2 * i]
315                   + pIn[2 * i]             * pATable[2 * i + 1]
316                   + pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
317                   - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
318         */
319 
320         outI = *pSrc2 * *(pCoefB + 1);
321         outI = outI - (*(pSrc2 + 1) * *pCoefB);
322         outI = outI + (*(pSrc1 + 1) * *pCoefA);
323         outI = outI + (*pSrc1 * *(pCoefA + 1));
324 
325         /* update input pointers */
326         pSrc1 += 2U;
327         pSrc2 -= 2U;
328 
329         /* write output */
330         pDst[2U * i] = (q15_t) outR;
331         pDst[2U * i + 1U] = outI >> 16U;
332 
333         /* write complex conjugate output */
334         pDst[(4U * fftLen) - (2U * i)] = (q15_t) outR;
335         pDst[((4U * fftLen) - (2U * i)) + 1U] = -(outI >> 16U);
336 
337         /* update coefficient pointer */
338         pCoefB = pCoefB + (2U * modifier);
339         pCoefA = pCoefA + (2U * modifier);
340 
341         i++;
342     }
343 
344     pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1;
345     pDst[2U * fftLen + 1U] = 0;
346 
347     pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
348     pDst[1] = 0;
349 
350 #endif /* #if defined (ARM_MATH_DSP) */
351 }
352 #endif /* defined(ARM_MATH_MVEI) */
353 
354 /**
355   @brief         Core Real IFFT process
356   @param[in]     pSrc      points to input buffer
357   @param[in]     fftLen    length of FFT
358   @param[in]     pATable   points to twiddle Coef A buffer
359   @param[in]     pBTable   points to twiddle Coef B buffer
360   @param[out]    pDst      points to output buffer
361   @param[in]     modifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
362   @return        none
363 
364   @par
365                    The function implements a Real IFFT
366  */
367 
368 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
369 
370 #include "arm_helium_utils.h"
371 #include "arm_vec_fft.h"
372 
arm_split_rifft_q15(q15_t * pSrc,uint32_t fftLen,const q15_t * pATable,const q15_t * pBTable,q15_t * pDst,uint32_t modifier)373 void arm_split_rifft_q15(
374         q15_t * pSrc,
375         uint32_t fftLen,
376   const q15_t * pATable,
377   const q15_t * pBTable,
378         q15_t * pDst,
379         uint32_t modifier)
380 {
381    uint32_t        i;                  /* Loop Counter */
382     const q15_t    *pCoefA, *pCoefB;    /* Temporary pointers for twiddle factors */
383     q15_t          *pIn1;
384     uint16x8_t      offset = { 6, 7, 4, 5, 2, 3, 0, 1 };
385     uint16x8_t      offsetCoef;
386     int16x8_t       conj = { 1, -1, 1, -1, 1, -1, 1, -1 }; /* conjugate */
387     const uint16_t  offsetCoefArr[16] = {
388         0, 0, 2, 2, 4, 4, 6, 6,
389         0, 1, 0, 1, 0, 1, 0, 1
390     };
391 
392     offsetCoef = vmulq_n_u16(vld1q_u16(offsetCoefArr), modifier) + vld1q_u16(offsetCoefArr + 8);
393 
394     offset = vaddq_n_u16(offset, (2 * fftLen - 6));
395 
396     /* Init coefficient pointers */
397     pCoefA = &pATable[0];
398     pCoefB = &pBTable[0];
399 
400     const q15_t    *pCoefAb, *pCoefBb;
401     pCoefAb = pCoefA;
402     pCoefBb = pCoefB;
403 
404     pIn1 = &pSrc[0];
405 
406     i = fftLen;
407     i = i / 4;
408 
409     while (i > 0U) {
410         q15x8_t         in1 = vld1q_s16(pIn1);
411         q15x8_t         in2 = vldrhq_gather_shifted_offset_s16(pSrc, offset);
412         q15x8_t         coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef);
413         q15x8_t         coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef);
414 
415         /* can we avoid the conjugate here ? */
416         q15x8_t         out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA),
417                                      vmulq(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB)));
418 
419         vst1q_s16(pDst, out);
420         pDst += 8;
421 
422         offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8);
423         offset -= 8;
424 
425         pIn1 += 8;
426         i -= 1;
427     }
428 }
429 #else
arm_split_rifft_q15(q15_t * pSrc,uint32_t fftLen,const q15_t * pATable,const q15_t * pBTable,q15_t * pDst,uint32_t modifier)430 void arm_split_rifft_q15(
431         q15_t * pSrc,
432         uint32_t fftLen,
433   const q15_t * pATable,
434   const q15_t * pBTable,
435         q15_t * pDst,
436         uint32_t modifier)
437 {
438         uint32_t i;                                    /* Loop Counter */
439         q31_t outR, outI;                              /* Temporary variables for output */
440   const q15_t *pCoefA, *pCoefB;                        /* Temporary pointers for twiddle factors */
441         q15_t *pSrc1, *pSrc2;
442         q15_t *pDst1 = &pDst[0];
443 
444   pCoefA = &pATable[0];
445   pCoefB = &pBTable[0];
446 
447   pSrc1 = &pSrc[0];
448   pSrc2 = &pSrc[2 * fftLen];
449 
450   i = fftLen;
451   while (i > 0U)
452   {
453       /*
454         outR = (  pIn[2 * i]             * pATable[2 * i]
455                 + pIn[2 * i + 1]         * pATable[2 * i + 1]
456                 + pIn[2 * n - 2 * i]     * pBTable[2 * i]
457                 - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
458 
459         outI = (  pIn[2 * i + 1]         * pATable[2 * i]
460                 - pIn[2 * i]             * pATable[2 * i + 1]
461                 - pIn[2 * n - 2 * i]     * pBTable[2 * i + 1]
462                 - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
463        */
464 
465 #if defined (ARM_MATH_DSP)
466 
467 #ifndef ARM_MATH_BIG_ENDIAN
468       /* pIn[2 * n - 2 * i] * pBTable[2 * i] - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
469       outR = __SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB));
470 #else
471       /* -(-pIn[2 * n - 2 * i] * pBTable[2 * i] + pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1])) */
472       outR = -(__SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB)));
473 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
474 
475       /* pIn[2 * i] * pATable[2 * i] + pIn[2 * i + 1] * pATable[2 * i + 1] + pIn[2 * n - 2 * i] * pBTable[2 * i] */
476       outR = __SMLAD(read_q15x2(pSrc1), read_q15x2 ((q15_t *) pCoefA), outR) >> 16U;
477 
478       /* -pIn[2 * n - 2 * i] * pBTable[2 * i + 1] + pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
479       outI = __SMUADX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB));
480 
481       /* pIn[2 * i + 1] * pATable[2 * i] - pIn[2 * i] * pATable[2 * i + 1] */
482 #ifndef ARM_MATH_BIG_ENDIAN
483       outI = __SMLSDX(read_q15x2 ((q15_t *) pCoefA), read_q15x2_ia (&pSrc1), -outI);
484 #else
485       outI = __SMLSDX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), -outI);
486 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
487 
488       /* write output */
489 #ifndef ARM_MATH_BIG_ENDIAN
490       write_q15x2_ia (&pDst1, __PKHBT(outR, (outI >> 16U), 16));
491 #else
492       write_q15x2_ia (&pDst1, __PKHBT((outI >> 16U), outR, 16));
493 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
494 
495 
496 #else  /* #if defined (ARM_MATH_DSP) */
497 
498       outR = *pSrc2 * *pCoefB;
499       outR = outR - (*(pSrc2 + 1) * *(pCoefB + 1));
500       outR = outR + (*pSrc1 * *pCoefA);
501       outR = (outR + (*(pSrc1 + 1) * *(pCoefA + 1))) >> 16;
502 
503       outI = *(pSrc1 + 1) * *pCoefA;
504       outI = outI - (*pSrc1 * *(pCoefA + 1));
505       outI = outI - (*pSrc2 * *(pCoefB + 1));
506       outI = outI - (*(pSrc2 + 1) * *(pCoefB));
507 
508       /* update input pointers */
509       pSrc1 += 2U;
510       pSrc2 -= 2U;
511 
512       /* write output */
513       *pDst1++ = (q15_t) outR;
514       *pDst1++ = (q15_t) (outI >> 16);
515 
516 #endif /* #if defined (ARM_MATH_DSP) */
517 
518       /* update coefficient pointer */
519       pCoefB = pCoefB + (2 * modifier);
520       pCoefA = pCoefA + (2 * modifier);
521 
522       i--;
523   }
524 
525 }
526 #endif /* defined(ARM_MATH_MVEI) */
527