1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_rfft_q15.c
4 * Description: RFFT & RIFFT Q15 process function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/transform_functions.h"
30
31 /* ----------------------------------------------------------------------
32 * Internal functions prototypes
33 * -------------------------------------------------------------------- */
34
35 void arm_split_rfft_q15(
36 q15_t * pSrc,
37 uint32_t fftLen,
38 const q15_t * pATable,
39 const q15_t * pBTable,
40 q15_t * pDst,
41 uint32_t modifier);
42
43 void arm_split_rifft_q15(
44 q15_t * pSrc,
45 uint32_t fftLen,
46 const q15_t * pATable,
47 const q15_t * pBTable,
48 q15_t * pDst,
49 uint32_t modifier);
50
51 /**
52 @addtogroup RealFFT
53 @{
54 */
55
56 /**
57 @brief Processing function for the Q15 RFFT/RIFFT.
58 @param[in] S points to an instance of the Q15 RFFT/RIFFT structure
59 @param[in] pSrc points to input buffer (Source buffer is modified by this function.)
60 @param[out] pDst points to output buffer
61 @return none
62
63 @par Input an output formats
64 Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
65 Hence the output format is different for different RFFT sizes.
66 The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT:
67 @par
68 \image html RFFTQ15.gif "Input and Output Formats for Q15 RFFT"
69 @par
70 \image html RIFFTQ15.gif "Input and Output Formats for Q15 RIFFT"
71 @par
72 If the input buffer is of length N, the output buffer must have length 2*N.
73 The input buffer is modified by this function.
74 @par
75 For the RIFFT, the source buffer must at least have length
76 fftLenReal + 2.
77 The last two elements must be equal to what would be generated
78 by the RFFT:
79 (pSrc[0] - pSrc[1]) >> 1 and 0
80 */
81
arm_rfft_q15(const arm_rfft_instance_q15 * S,q15_t * pSrc,q15_t * pDst)82 void arm_rfft_q15(
83 const arm_rfft_instance_q15 * S,
84 q15_t * pSrc,
85 q15_t * pDst)
86 {
87 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
88 const arm_cfft_instance_q15 *S_CFFT = &(S->cfftInst);
89 #else
90 const arm_cfft_instance_q15 *S_CFFT = S->pCfft;
91 #endif
92 uint32_t L2 = S->fftLenReal >> 1U;
93
94 /* Calculation of RIFFT of input */
95 if (S->ifftFlagR == 1U)
96 {
97 /* Real IFFT core process */
98 arm_split_rifft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
99
100 /* Complex IFFT process */
101 arm_cfft_q15 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
102
103 arm_shift_q15(pDst, 1, pDst, S->fftLenReal);
104 }
105 else
106 {
107 /* Calculation of RFFT of input */
108
109 /* Complex FFT process */
110 arm_cfft_q15 (S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR);
111
112 /* Real FFT core process */
113 arm_split_rfft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
114 }
115
116 }
117
118 /**
119 @} end of RealFFT group
120 */
121
122 /**
123 @brief Core Real FFT process
124 @param[in] pSrc points to input buffer
125 @param[in] fftLen length of FFT
126 @param[in] pATable points to twiddle Coef A buffer
127 @param[in] pBTable points to twiddle Coef B buffer
128 @param[out] pDst points to output buffer
129 @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
130 @return none
131
132 @par
133 The function implements a Real FFT
134 */
135
136 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
137
138 #include "arm_helium_utils.h"
139 #include "arm_vec_fft.h"
140
141 #if defined(__CMSIS_GCC_H)
142 #define MVE_CMPLX_MULT_FX_AxB_S16(A,B) vqdmladhxq_s16(vqdmlsdhq_s16((__typeof(A))vuninitializedq_s16(), A, B), A, B)
143 #define MVE_CMPLX_MULT_FX_AxConjB_S16(A,B) vqdmladhq_s16(vqdmlsdhxq_s16((__typeof(A))vuninitializedq_s16(), A, B), A, B)
144
145 #endif
146
arm_split_rfft_q15(q15_t * pSrc,uint32_t fftLen,const q15_t * pATable,const q15_t * pBTable,q15_t * pDst,uint32_t modifier)147 void arm_split_rfft_q15(
148 q15_t * pSrc,
149 uint32_t fftLen,
150 const q15_t * pATable,
151 const q15_t * pBTable,
152 q15_t * pDst,
153 uint32_t modifier)
154 {
155 uint32_t i; /* Loop Counter */
156 const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
157 q15_t *pOut1 = &pDst[2];
158 q15_t *pIn1 = &pSrc[2];
159 uint16x8_t offsetIn = { 6, 7, 4, 5, 2, 3, 0, 1 };
160 uint16x8_t offsetCoef;
161 const uint16_t offsetCoefArr[16] = {
162 0, 0, 2, 2, 4, 4, 6, 6,
163 0, 1, 0, 1, 0, 1, 0, 1
164 };
165
166 offsetCoef = vmulq_n_u16(vld1q_u16(offsetCoefArr), modifier) + vld1q_u16(offsetCoefArr + 8);
167 offsetIn = vaddq_n_u16(offsetIn, (2 * fftLen - 8));
168
169 /* Init coefficient pointers */
170 pCoefA = &pATable[modifier * 2];
171 pCoefB = &pBTable[modifier * 2];
172
173 const q15_t *pCoefAb, *pCoefBb;
174 pCoefAb = pCoefA;
175 pCoefBb = pCoefB;
176
177 pIn1 = &pSrc[2];
178
179 i = fftLen - 1U;
180 i = i / 4 + 1;
181 while (i > 0U) {
182 q15x8_t in1 = vld1q_s16(pIn1);
183 q15x8_t in2 = vldrhq_gather_shifted_offset_s16(pSrc, offsetIn);
184 q15x8_t coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef);
185 q15x8_t coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef);
186
187 #if defined(__CMSIS_GCC_H)
188 q15x8_t out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxB_S16(in1, coefA),
189 MVE_CMPLX_MULT_FX_AxConjB_S16(coefB, in2));
190 #else
191 q15x8_t out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxB(in1, coefA),
192 MVE_CMPLX_MULT_FX_AxConjB(coefB, in2));
193 #endif
194 vst1q_s16(pOut1, out);
195 pOut1 += 8;
196
197 offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8);
198 offsetIn -= 8;
199 pIn1 += 8;
200 i -= 1;
201 }
202
203 pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
204 pDst[2 * fftLen + 1] = 0;
205
206 pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
207 pDst[1] = 0;
208 }
209 #else
arm_split_rfft_q15(q15_t * pSrc,uint32_t fftLen,const q15_t * pATable,const q15_t * pBTable,q15_t * pDst,uint32_t modifier)210 void arm_split_rfft_q15(
211 q15_t * pSrc,
212 uint32_t fftLen,
213 const q15_t * pATable,
214 const q15_t * pBTable,
215 q15_t * pDst,
216 uint32_t modifier)
217 {
218 uint32_t i; /* Loop Counter */
219 q31_t outR, outI; /* Temporary variables for output */
220 const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
221 q15_t *pSrc1, *pSrc2;
222 #if defined (ARM_MATH_DSP)
223 q15_t *pD1, *pD2;
224 #endif
225
226 /* Init coefficient pointers */
227 pCoefA = &pATable[modifier * 2];
228 pCoefB = &pBTable[modifier * 2];
229
230 pSrc1 = &pSrc[2];
231 pSrc2 = &pSrc[(2U * fftLen) - 2U];
232
233 #if defined (ARM_MATH_DSP)
234
235 i = 1U;
236 pD1 = pDst + 2;
237 pD2 = pDst + (4U * fftLen) - 2;
238
239 for (i = fftLen - 1; i > 0; i--)
240 {
241 /*
242 outR = ( pSrc[2 * i] * pATable[2 * i]
243 - pSrc[2 * i + 1] * pATable[2 * i + 1]
244 + pSrc[2 * n - 2 * i] * pBTable[2 * i]
245 + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
246
247 outI = ( pIn[2 * i + 1] * pATable[2 * i]
248 + pIn[2 * i] * pATable[2 * i + 1]
249 + pIn[2 * n - 2 * i] * pBTable[2 * i + 1]
250 - pIn[2 * n - 2 * i + 1] * pBTable[2 * i])
251 */
252
253
254 #ifndef ARM_MATH_BIG_ENDIAN
255 /* pSrc[2 * i] * pATable[2 * i] - pSrc[2 * i + 1] * pATable[2 * i + 1] */
256 outR = __SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA));
257 #else
258 /* -(pSrc[2 * i + 1] * pATable[2 * i + 1] - pSrc[2 * i] * pATable[2 * i]) */
259 outR = -(__SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA)));
260 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
261
262 /* pSrc[2 * n - 2 * i] * pBTable[2 * i] + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
263 outR = __SMLAD(read_q15x2 (pSrc2), read_q15x2((q15_t *) pCoefB), outR) >> 16U;
264
265 /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] - pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
266 #ifndef ARM_MATH_BIG_ENDIAN
267 outI = __SMUSDX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB));
268 #else
269 outI = __SMUSDX(read_q15x2 ((q15_t *) pCoefB), read_q15x2_da (&pSrc2));
270 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
271
272 /* (pIn[2 * i + 1] * pATable[2 * i] + pIn[2 * i] * pATable[2 * i + 1] */
273 outI = __SMLADX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), outI);
274
275 /* write output */
276 *pD1++ = (q15_t) outR;
277 *pD1++ = outI >> 16U;
278
279 /* write complex conjugate output */
280 pD2[0] = (q15_t) outR;
281 pD2[1] = -(outI >> 16U);
282 pD2 -= 2;
283
284 /* update coefficient pointer */
285 pCoefB = pCoefB + (2U * modifier);
286 pCoefA = pCoefA + (2U * modifier);
287 }
288
289 pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
290 pDst[2U * fftLen + 1U] = 0;
291
292 pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
293 pDst[1] = 0;
294
295 #else
296
297 i = 1U;
298
299 while (i < fftLen)
300 {
301 /*
302 outR = ( pSrc[2 * i] * pATable[2 * i]
303 - pSrc[2 * i + 1] * pATable[2 * i + 1]
304 + pSrc[2 * n - 2 * i] * pBTable[2 * i]
305 + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
306 */
307
308 outR = *pSrc1 * *pCoefA;
309 outR = outR - (*(pSrc1 + 1) * *(pCoefA + 1));
310 outR = outR + (*pSrc2 * *pCoefB);
311 outR = (outR + (*(pSrc2 + 1) * *(pCoefB + 1))) >> 16;
312
313 /*
314 outI = ( pIn[2 * i + 1] * pATable[2 * i]
315 + pIn[2 * i] * pATable[2 * i + 1]
316 + pIn[2 * n - 2 * i] * pBTable[2 * i + 1]
317 - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
318 */
319
320 outI = *pSrc2 * *(pCoefB + 1);
321 outI = outI - (*(pSrc2 + 1) * *pCoefB);
322 outI = outI + (*(pSrc1 + 1) * *pCoefA);
323 outI = outI + (*pSrc1 * *(pCoefA + 1));
324
325 /* update input pointers */
326 pSrc1 += 2U;
327 pSrc2 -= 2U;
328
329 /* write output */
330 pDst[2U * i] = (q15_t) outR;
331 pDst[2U * i + 1U] = outI >> 16U;
332
333 /* write complex conjugate output */
334 pDst[(4U * fftLen) - (2U * i)] = (q15_t) outR;
335 pDst[((4U * fftLen) - (2U * i)) + 1U] = -(outI >> 16U);
336
337 /* update coefficient pointer */
338 pCoefB = pCoefB + (2U * modifier);
339 pCoefA = pCoefA + (2U * modifier);
340
341 i++;
342 }
343
344 pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1;
345 pDst[2U * fftLen + 1U] = 0;
346
347 pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
348 pDst[1] = 0;
349
350 #endif /* #if defined (ARM_MATH_DSP) */
351 }
352 #endif /* defined(ARM_MATH_MVEI) */
353
354 /**
355 @brief Core Real IFFT process
356 @param[in] pSrc points to input buffer
357 @param[in] fftLen length of FFT
358 @param[in] pATable points to twiddle Coef A buffer
359 @param[in] pBTable points to twiddle Coef B buffer
360 @param[out] pDst points to output buffer
361 @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
362 @return none
363
364 @par
365 The function implements a Real IFFT
366 */
367
368 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
369
370 #include "arm_helium_utils.h"
371 #include "arm_vec_fft.h"
372
arm_split_rifft_q15(q15_t * pSrc,uint32_t fftLen,const q15_t * pATable,const q15_t * pBTable,q15_t * pDst,uint32_t modifier)373 void arm_split_rifft_q15(
374 q15_t * pSrc,
375 uint32_t fftLen,
376 const q15_t * pATable,
377 const q15_t * pBTable,
378 q15_t * pDst,
379 uint32_t modifier)
380 {
381 uint32_t i; /* Loop Counter */
382 const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
383 q15_t *pIn1;
384 uint16x8_t offset = { 6, 7, 4, 5, 2, 3, 0, 1 };
385 uint16x8_t offsetCoef;
386 int16x8_t conj = { 1, -1, 1, -1, 1, -1, 1, -1 }; /* conjugate */
387 const uint16_t offsetCoefArr[16] = {
388 0, 0, 2, 2, 4, 4, 6, 6,
389 0, 1, 0, 1, 0, 1, 0, 1
390 };
391
392 offsetCoef = vmulq_n_u16(vld1q_u16(offsetCoefArr), modifier) + vld1q_u16(offsetCoefArr + 8);
393
394 offset = vaddq_n_u16(offset, (2 * fftLen - 6));
395
396 /* Init coefficient pointers */
397 pCoefA = &pATable[0];
398 pCoefB = &pBTable[0];
399
400 const q15_t *pCoefAb, *pCoefBb;
401 pCoefAb = pCoefA;
402 pCoefBb = pCoefB;
403
404 pIn1 = &pSrc[0];
405
406 i = fftLen;
407 i = i / 4;
408
409 while (i > 0U) {
410 q15x8_t in1 = vld1q_s16(pIn1);
411 q15x8_t in2 = vldrhq_gather_shifted_offset_s16(pSrc, offset);
412 q15x8_t coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef);
413 q15x8_t coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef);
414
415 /* can we avoid the conjugate here ? */
416 q15x8_t out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA),
417 vmulq(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB)));
418
419 vst1q_s16(pDst, out);
420 pDst += 8;
421
422 offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8);
423 offset -= 8;
424
425 pIn1 += 8;
426 i -= 1;
427 }
428 }
429 #else
arm_split_rifft_q15(q15_t * pSrc,uint32_t fftLen,const q15_t * pATable,const q15_t * pBTable,q15_t * pDst,uint32_t modifier)430 void arm_split_rifft_q15(
431 q15_t * pSrc,
432 uint32_t fftLen,
433 const q15_t * pATable,
434 const q15_t * pBTable,
435 q15_t * pDst,
436 uint32_t modifier)
437 {
438 uint32_t i; /* Loop Counter */
439 q31_t outR, outI; /* Temporary variables for output */
440 const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
441 q15_t *pSrc1, *pSrc2;
442 q15_t *pDst1 = &pDst[0];
443
444 pCoefA = &pATable[0];
445 pCoefB = &pBTable[0];
446
447 pSrc1 = &pSrc[0];
448 pSrc2 = &pSrc[2 * fftLen];
449
450 i = fftLen;
451 while (i > 0U)
452 {
453 /*
454 outR = ( pIn[2 * i] * pATable[2 * i]
455 + pIn[2 * i + 1] * pATable[2 * i + 1]
456 + pIn[2 * n - 2 * i] * pBTable[2 * i]
457 - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
458
459 outI = ( pIn[2 * i + 1] * pATable[2 * i]
460 - pIn[2 * i] * pATable[2 * i + 1]
461 - pIn[2 * n - 2 * i] * pBTable[2 * i + 1]
462 - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
463 */
464
465 #if defined (ARM_MATH_DSP)
466
467 #ifndef ARM_MATH_BIG_ENDIAN
468 /* pIn[2 * n - 2 * i] * pBTable[2 * i] - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
469 outR = __SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB));
470 #else
471 /* -(-pIn[2 * n - 2 * i] * pBTable[2 * i] + pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1])) */
472 outR = -(__SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB)));
473 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
474
475 /* pIn[2 * i] * pATable[2 * i] + pIn[2 * i + 1] * pATable[2 * i + 1] + pIn[2 * n - 2 * i] * pBTable[2 * i] */
476 outR = __SMLAD(read_q15x2(pSrc1), read_q15x2 ((q15_t *) pCoefA), outR) >> 16U;
477
478 /* -pIn[2 * n - 2 * i] * pBTable[2 * i + 1] + pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
479 outI = __SMUADX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB));
480
481 /* pIn[2 * i + 1] * pATable[2 * i] - pIn[2 * i] * pATable[2 * i + 1] */
482 #ifndef ARM_MATH_BIG_ENDIAN
483 outI = __SMLSDX(read_q15x2 ((q15_t *) pCoefA), read_q15x2_ia (&pSrc1), -outI);
484 #else
485 outI = __SMLSDX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), -outI);
486 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
487
488 /* write output */
489 #ifndef ARM_MATH_BIG_ENDIAN
490 write_q15x2_ia (&pDst1, __PKHBT(outR, (outI >> 16U), 16));
491 #else
492 write_q15x2_ia (&pDst1, __PKHBT((outI >> 16U), outR, 16));
493 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
494
495
496 #else /* #if defined (ARM_MATH_DSP) */
497
498 outR = *pSrc2 * *pCoefB;
499 outR = outR - (*(pSrc2 + 1) * *(pCoefB + 1));
500 outR = outR + (*pSrc1 * *pCoefA);
501 outR = (outR + (*(pSrc1 + 1) * *(pCoefA + 1))) >> 16;
502
503 outI = *(pSrc1 + 1) * *pCoefA;
504 outI = outI - (*pSrc1 * *(pCoefA + 1));
505 outI = outI - (*pSrc2 * *(pCoefB + 1));
506 outI = outI - (*(pSrc2 + 1) * *(pCoefB));
507
508 /* update input pointers */
509 pSrc1 += 2U;
510 pSrc2 -= 2U;
511
512 /* write output */
513 *pDst1++ = (q15_t) outR;
514 *pDst1++ = (q15_t) (outI >> 16);
515
516 #endif /* #if defined (ARM_MATH_DSP) */
517
518 /* update coefficient pointer */
519 pCoefB = pCoefB + (2 * modifier);
520 pCoefA = pCoefA + (2 * modifier);
521
522 i--;
523 }
524
525 }
526 #endif /* defined(ARM_MATH_MVEI) */
527