/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cfft_q15.c
 * Description:  Combined radix decimation-in-frequency CFFT processing function (Q15)
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/transform_functions.h"

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "arm_vec_fft.h"


static void _arm_radix4_butterfly_q15_mve(
    const arm_cfft_instance_q15 * S,
    q15_t   *pSrc,
    uint32_t fftLen)
{
    q15x8_t vecTmp0, vecTmp1;
    q15x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    q15x8_t vecA, vecB, vecC, vecD;
    q15x8_t vecW;
    uint32_t  blkCnt;
    uint32_t  n1, n2;
    uint32_t  stage = 0;
    int32_t  iter = 1;
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q15_t *), (4 - 16) * sizeof(q15_t *),
        (8 - 16) * sizeof(q15_t *), (12 - 16) * sizeof(q15_t *)
    };
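
    /*
     * Addressing note: on 32-bit targets sizeof(q15_t *) == 4, which is also
     * the size in bytes of one q15 complex pair. The last stage below uses a
     * 32-bit gather whose base vector advances by 64 bytes (16 complex pairs)
     * per write-back load, so the strides are pre-biased by -16 pairs: after
     * the first increment the four lanes point at complex pairs 0/4/8/12, and
     * the immediate offsets 4/8/12 then select pairs 1/5/9/13, 2/6/10/14 and
     * 3/7/11/15 for the other three butterfly legs.
     */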

    /*
     * Process first and middle stages.
     * Each of these radix-4 stages scales the data down by 4 (two halvings),
     * since both butterfly levels use halving add/sub instructions.
     */
    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;

    for (int k = fftLen / 4u; k > 1; k >>= 2u)
    {
        for (int i = 0; i < iter; i++)
        {
            q15_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                S->rearranged_twiddle_tab_stride2_arr[stage]];
            q15_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                S->rearranged_twiddle_tab_stride3_arr[stage]];
            q15_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                S->rearranged_twiddle_tab_stride1_arr[stage]];
            q15_t const *pW1, *pW2, *pW3;
            q15_t    *inA = pSrc + CMPLX_DIM * i * n1;
            q15_t    *inB = inA + n2 * CMPLX_DIM;
            q15_t    *inC = inB + n2 * CMPLX_DIM;
            q15_t    *inD = inC + n2 * CMPLX_DIM;

            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 4;
            /*
             * load 4 x q15 complex pairs
             */
            vecA = vldrhq_s16(inA);
            vecC = vldrhq_s16(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrhq_s16(inB);
                vecD = vldrhq_s16(inD);

                vecSum0 = vhaddq(vecA, vecC);
                vecDiff0 = vhsubq(vecA, vecC);

                vecSum1 = vhaddq(vecB, vecD);
                vecDiff1 = vhsubq(vecB, vecD);
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vhaddq(vecSum0, vecSum1);
                vst1q(inA, vecTmp0);
                inA += 8;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vhsubq(vecSum0, vecSum1);
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]' .* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);

                vst1q(inB, vecTmp1);
                inB += 8;
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]' .* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 8;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]' .* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 8;

                vecA = vldrhq_s16(inA);
                vecC = vldrhq_s16(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * Start of the last stage
     */
    uint32x4_t vecScGathAddr = vld1q_u32 (strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
    vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);

    blkCnt = (fftLen >> 4);
    while (blkCnt > 0U)
    {
        vecSum0 = vhaddq(vecA, vecC);
        vecDiff0 = vhsubq(vecA, vecC);

        vecB = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 4);
        vecD = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 12);

        vecSum1 = vhaddq(vecB, vecD);
        vecDiff1 = vhsubq(vecB, vecD);
        /*
         * pre-load for next iteration
         */
        vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
        vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);

        vecTmp0 = vhaddq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);

        vecTmp0 = vhsubq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);

        blkCnt--;
    }
}

static void arm_cfft_radix4by2_q15_mve(const arm_cfft_instance_q15 *S, q15_t *pSrc, uint32_t fftLen)
{
    uint32_t n2;
    q15_t *pIn0;
    q15_t *pIn1;
    const q15_t *pCoef = S->pTwiddle;
    uint32_t     blkCnt;
    q15x8_t    vecIn0, vecIn1, vecSum, vecDiff;
    q15x8_t    vecCmplxTmp, vecTw;
    q15_t  const *pCoefVec;

    n2 = fftLen >> 1;

    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;
    pCoefVec = pCoef;

    blkCnt = n2 / 4;

    while (blkCnt > 0U)
    {
        vecIn0 = *(q15x8_t *) pIn0;
        vecIn1 = *(q15x8_t *) pIn1;

        vecIn0 = vecIn0 >> 1;
        vecIn1 = vecIn1 >> 1;
        vecSum = vhaddq(vecIn0, vecIn1);
        vst1q(pIn0, vecSum);
        pIn0 += 8;

        vecTw = vld1q(pCoefVec);
        pCoefVec += 8;

        vecDiff = vhsubq(vecIn0, vecIn1);
        vecCmplxTmp = MVE_CMPLX_MULT_FX_AxConjB(vecDiff, vecTw);
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 8;

        blkCnt--;
    }

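    /* first col */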
    _arm_radix4_butterfly_q15_mve(S, pSrc, n2);

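    /* second col */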
    _arm_radix4_butterfly_q15_mve(S, pSrc + fftLen, n2);

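    /*
     * double the result to compensate for the extra halving performed by the
     * initial radix-2 split stage, so that the overall scaling matches the
     * pure radix-4 lengths (1/fftLen overall)
     */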
    pIn0 = pSrc;
    blkCnt = (fftLen << 1) >> 3;
    while (blkCnt > 0U)
    {
        vecIn0 = *(q15x8_t *) pIn0;
        vecIn0 = vecIn0 << 1;
        vst1q(pIn0, vecIn0);
        pIn0 += 8;
        blkCnt--;
    }
    /*
     * tail
     * (will be merged through tail predication)
     */
    blkCnt = (fftLen << 1) & 7;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp16q(blkCnt);

        vecIn0 = *(q15x8_t *) pIn0;
        vecIn0 = vecIn0 << 1;
        vstrhq_p(pIn0, vecIn0, p0);
    }
}

static void _arm_radix4_butterfly_inverse_q15_mve(const arm_cfft_instance_q15 *S, q15_t *pSrc, uint32_t fftLen)
{
    q15x8_t vecTmp0, vecTmp1;
    q15x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    q15x8_t vecA, vecB, vecC, vecD;
    q15x8_t vecW;
    uint32_t  blkCnt;
    uint32_t  n1, n2;
    uint32_t  stage = 0;
    int32_t  iter = 1;
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q15_t *), (4 - 16) * sizeof(q15_t *),
        (8 - 16) * sizeof(q15_t *), (12 - 16) * sizeof(q15_t *)
    };
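
    /* the strides table and the gather/scatter addressing here are identical
       to _arm_radix4_butterfly_q15_mve above; see the note there */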

    /*
     * Process first and middle stages.
     * Each of these radix-4 stages scales the data down by 4 (two halvings),
     * since both butterfly levels use halving add/sub instructions.
     */
    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;

    for (int k = fftLen / 4u; k > 1; k >>= 2u)
    {
        for (int i = 0; i < iter; i++)
        {
            q15_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                S->rearranged_twiddle_tab_stride2_arr[stage]];
            q15_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                S->rearranged_twiddle_tab_stride3_arr[stage]];
            q15_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                S->rearranged_twiddle_tab_stride1_arr[stage]];
            q15_t const *pW1, *pW2, *pW3;
            q15_t    *inA = pSrc + CMPLX_DIM * i * n1;
            q15_t    *inB = inA + n2 * CMPLX_DIM;
            q15_t    *inC = inB + n2 * CMPLX_DIM;
            q15_t    *inD = inC + n2 * CMPLX_DIM;

            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 4;
            /*
             * load 4 x q15 complex pairs
             */
            vecA = vldrhq_s16(inA);
            vecC = vldrhq_s16(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrhq_s16(inB);
                vecD = vldrhq_s16(inD);

                vecSum0 = vhaddq(vecA, vecC);
                vecDiff0 = vhsubq(vecA, vecC);

                vecSum1 = vhaddq(vecB, vecD);
                vecDiff1 = vhsubq(vecB, vecD);
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vhaddq(vecSum0, vecSum1);
                vst1q(inA, vecTmp0);
                inA += 8;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vhsubq(vecSum0, vecSum1);
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]' .* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);

                vst1q(inB, vecTmp1);
                inB += 8;
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]' .* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);
                vst1q(inC, vecTmp1);
                inC += 8;
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]' .* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 8;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);
                vst1q(inD, vecTmp1);
                inD += 8;

                vecA = vldrhq_s16(inA);
                vecC = vldrhq_s16(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * Start of the last stage
     */
    uint32x4_t vecScGathAddr = vld1q_u32(strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
    vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);

    blkCnt = (fftLen >> 4);
    while (blkCnt > 0U)
    {
        vecSum0 = vhaddq(vecA, vecC);
        vecDiff0 = vhsubq(vecA, vecC);

        vecB = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 4);
        vecD = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 12);

        vecSum1 = vhaddq(vecB, vecD);
        vecDiff1 = vhsubq(vecB, vecD);
        /*
         * pre-load for next iteration
         */
        vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
        vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);

        vecTmp0 = vhaddq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);

        vecTmp0 = vhsubq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);

        blkCnt--;
    }
}

static void arm_cfft_radix4by2_inverse_q15_mve(const arm_cfft_instance_q15 *S, q15_t *pSrc, uint32_t fftLen)
{
    uint32_t n2;
    q15_t *pIn0;
    q15_t *pIn1;
    const q15_t *pCoef = S->pTwiddle;

    uint32_t     blkCnt;
    q15x8_t    vecIn0, vecIn1, vecSum, vecDiff;
    q15x8_t    vecCmplxTmp, vecTw;
    q15_t  const *pCoefVec;

    n2 = fftLen >> 1;

    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;
    pCoefVec = pCoef;

    blkCnt = n2 / 4;

    while (blkCnt > 0U)
    {
        vecIn0 = *(q15x8_t *) pIn0;
        vecIn1 = *(q15x8_t *) pIn1;

        vecIn0 = vecIn0 >> 1;
        vecIn1 = vecIn1 >> 1;
        vecSum = vhaddq(vecIn0, vecIn1);
        vst1q(pIn0, vecSum);
        pIn0 += 8;

        vecTw = vld1q(pCoefVec);
        pCoefVec += 8;

        vecDiff = vhsubq(vecIn0, vecIn1);
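        /* inlined full complex multiply vecDiff * vecTw: the two intrinsics
           together compute re*re - im*im and re*im + im*re with rounding,
           doubling and saturation; this is the conjugate-free counterpart of
           the MVE_CMPLX_MULT_FX_AxConjB used in the forward split above */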
        vecCmplxTmp = vqrdmlsdhq(vuninitializedq_s16(), vecDiff, vecTw);
        vecCmplxTmp = vqrdmladhxq(vecCmplxTmp, vecDiff, vecTw);
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 8;

        blkCnt--;
    }

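    /* first col */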
    _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, n2);

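    /* second col */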
    _arm_radix4_butterfly_inverse_q15_mve(S, pSrc + fftLen, n2);

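    /*
     * double the result to compensate for the extra halving performed by the
     * initial radix-2 split stage, so that the overall scaling matches the
     * pure radix-4 lengths (1/fftLen overall)
     */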
    pIn0 = pSrc;
    blkCnt = (fftLen << 1) >> 3;
    while (blkCnt > 0U)
    {
        vecIn0 = *(q15x8_t *) pIn0;
        vecIn0 = vecIn0 << 1;
        vst1q(pIn0, vecIn0);
        pIn0 += 8;
        blkCnt--;
    }
    /*
     * tail
     * (will be merged through tail predication)
     */
    blkCnt = (fftLen << 1) & 7;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp16q(blkCnt);

        vecIn0 = *(q15x8_t *) pIn0;
        vecIn0 = vecIn0 << 1;
        vstrhq_p(pIn0, vecIn0, p0);
    }
}

/**
  @ingroup groupTransforms
 */

/**
  @addtogroup ComplexFFT
  @{
 */

/**
  @brief         Processing function for Q15 complex FFT.
  @param[in]     S              points to an instance of Q15 CFFT structure
  @param[in,out] pSrc           points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  @param[in]     ifftFlag       flag that selects transform direction
                   - value = 0: forward transform
                   - value = 1: inverse transform
  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
                   - value = 0: disables bit reversal of output
                   - value = 1: enables bit reversal of output
  @return        none
 */
void arm_cfft_q15(
  const arm_cfft_instance_q15 * S,
        q15_t * pSrc,
        uint8_t ifftFlag,
        uint8_t bitReverseFlag)
{
        uint32_t fftLen = S->fftLen;

        if (ifftFlag == 1U) {

            switch (fftLen) {
            case 16:
            case 64:
            case 256:
            case 1024:
            case 4096:
                _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen);
                break;

            case 32:
            case 128:
            case 512:
            case 2048:
                arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen);
                break;
            }
        } else {
            switch (fftLen) {
            case 16:
            case 64:
            case 256:
            case 1024:
            case 4096:
                _arm_radix4_butterfly_q15_mve(S, pSrc, fftLen);
                break;

            case 32:
            case 128:
            case 512:
            case 2048:
                arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen);
                break;
            }
        }

        if (bitReverseFlag)
        {
            arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
        }
}
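
/**
  @} end of ComplexFFT group
 */

/*
 * Usage sketch (illustrative only, not part of the library source): run a
 * forward and an inverse 256-point Q15 CFFT in-place. This assumes the
 * CMSIS-DSP initialization helper arm_cfft_init_q15() is available in the
 * build; error handling is elided.
 *
 *   #include "dsp/transform_functions.h"
 *
 *   q15_t buffer[2 * 256];                  // interleaved real/imag samples
 *   arm_cfft_instance_q15 cfft;
 *
 *   if (arm_cfft_init_q15(&cfft, 256) == ARM_MATH_SUCCESS)
 *   {
 *       arm_cfft_q15(&cfft, buffer, 0, 1);  // forward, bit-reversed output
 *       arm_cfft_q15(&cfft, buffer, 1, 1);  // inverse, bit-reversed output
 *   }
 *
 * The fixed-point CFFT scales its output down as it proceeds to avoid
 * overflow, so a forward/inverse round trip returns an attenuated copy of
 * the input; see the CMSIS-DSP transform documentation for the exact
 * per-length formats.
 */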

#else

extern void arm_radix4_butterfly_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pCoef,
        uint32_t twidCoefModifier);

extern void arm_radix4_butterfly_inverse_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pCoef,
        uint32_t twidCoefModifier);

extern void arm_bitreversal_16(
        uint16_t * pSrc,
  const uint16_t bitRevLen,
  const uint16_t * pBitRevTable);

void arm_cfft_radix4by2_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pCoef);

void arm_cfft_radix4by2_inverse_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pCoef);

/**
  @ingroup groupTransforms
 */

/**
  @addtogroup ComplexFFT
  @{
 */

/**
  @brief         Processing function for Q15 complex FFT.
  @param[in]     S              points to an instance of Q15 CFFT structure
  @param[in,out] p1             points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  @param[in]     ifftFlag       flag that selects transform direction
                   - value = 0: forward transform
                   - value = 1: inverse transform
  @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
                   - value = 0: disables bit reversal of output
                   - value = 1: enables bit reversal of output
  @return        none
 */

void arm_cfft_q15(
  const arm_cfft_instance_q15 * S,
        q15_t * p1,
        uint8_t ifftFlag,
        uint8_t bitReverseFlag)
{
  uint32_t L = S->fftLen;

  if (ifftFlag == 1U)
  {
     switch (L)
     {
     case 16:
     case 64:
     case 256:
     case 1024:
     case 4096:
       arm_radix4_butterfly_inverse_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 );
       break;

     case 32:
     case 128:
     case 512:
     case 2048:
       arm_cfft_radix4by2_inverse_q15 ( p1, L, S->pTwiddle );
       break;
     }
  }
  else
  {
     switch (L)
     {
     case 16:
     case 64:
     case 256:
     case 1024:
     case 4096:
       arm_radix4_butterfly_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 );
       break;

     case 32:
     case 128:
     case 512:
     case 2048:
       arm_cfft_radix4by2_q15 ( p1, L, S->pTwiddle );
       break;
     }
  }

  if ( bitReverseFlag )
    arm_bitreversal_16 ((uint16_t*) p1, S->bitRevLength, S->pBitRevTable);
}

/**
  @} end of ComplexFFT group
 */

void arm_cfft_radix4by2_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pCoef)
{
        uint32_t i;
        uint32_t n2;
        q15_t p0, p1, p2, p3;
#if defined (ARM_MATH_DSP)
        q31_t T, S, R;
        q31_t coeff, out1, out2;
  const q15_t *pC = pCoef;
        q15_t *pSi = pSrc;
        q15_t *pSl = pSrc + fftLen;
#else
        uint32_t l;
        q15_t xt, yt, cosVal, sinVal;
#endif

  n2 = fftLen >> 1U;

#if defined (ARM_MATH_DSP)

  for (i = n2; i > 0; i--)
  {
      coeff = read_q15x2_ia ((q15_t **) &pC);

      T = read_q15x2 (pSi);
      T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */

      S = read_q15x2 (pSl);
      S = __SHADD16(S, 0); /* this is just a SIMD arithmetic shift right by 1 */

      R = __QSUB16(T, S);

      write_q15x2_ia (&pSi, __SHADD16(T, S));

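      /* complex multiply by the conjugated twiddle: out1/out2 hold the
         real/imaginary parts of R * conj(W), with the twiddle W = cos + j*sin
         packed into 'coeff' (operand order differs per endianness) */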
#ifndef ARM_MATH_BIG_ENDIAN
      out1 = __SMUAD(coeff, R) >> 16U;
      out2 = __SMUSDX(coeff, R);
#else
      out1 = __SMUSDX(R, coeff) >> 16U;
      out2 = __SMUAD(coeff, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

      write_q15x2_ia (&pSl, (q31_t)__PKHBT( out1, out2, 0 ) );
  }

#else /* #if defined (ARM_MATH_DSP) */

  for (i = 0; i < n2; i++)
  {
     cosVal = pCoef[2 * i];
     sinVal = pCoef[2 * i + 1];

     l = i + n2;

     xt =           (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
     pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;

     yt =               (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
     pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;

     pSrc[2 * l]     = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) +
                        ((int16_t) (((q31_t) yt * sinVal) >> 16U))  );

     pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) -
                        ((int16_t) (((q31_t) xt * sinVal) >> 16U))  );
  }

#endif /* #if defined (ARM_MATH_DSP) */

  /* first col */
  arm_radix4_butterfly_q15( pSrc,          n2, (q15_t*)pCoef, 2U);

  /* second col */
  arm_radix4_butterfly_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U);

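  /* double the result to compensate for the extra halving performed by the
     initial radix-2 split stage, so that the overall scaling matches the
     pure radix-4 lengths (1/fftLen overall) */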
  n2 = fftLen >> 1U;
  for (i = 0; i < n2; i++)
  {
     p0 = pSrc[4 * i + 0];
     p1 = pSrc[4 * i + 1];
     p2 = pSrc[4 * i + 2];
     p3 = pSrc[4 * i + 3];

     p0 <<= 1U;
     p1 <<= 1U;
     p2 <<= 1U;
     p3 <<= 1U;

     pSrc[4 * i + 0] = p0;
     pSrc[4 * i + 1] = p1;
     pSrc[4 * i + 2] = p2;
     pSrc[4 * i + 3] = p3;
  }

}

void arm_cfft_radix4by2_inverse_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pCoef)
{
        uint32_t i;
        uint32_t n2;
        q15_t p0, p1, p2, p3;
#if defined (ARM_MATH_DSP)
        q31_t T, S, R;
        q31_t coeff, out1, out2;
  const q15_t *pC = pCoef;
        q15_t *pSi = pSrc;
        q15_t *pSl = pSrc + fftLen;
#else
        uint32_t l;
        q15_t xt, yt, cosVal, sinVal;
#endif

  n2 = fftLen >> 1U;

#if defined (ARM_MATH_DSP)

  for (i = n2; i > 0; i--)
  {
     coeff = read_q15x2_ia ((q15_t **) &pC);

     T = read_q15x2 (pSi);
     T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */

     S = read_q15x2 (pSl);
     S = __SHADD16(S, 0); /* this is just a SIMD arithmetic shift right by 1 */

     R = __QSUB16(T, S);

     write_q15x2_ia (&pSi, __SHADD16(T, S));

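     /* complex multiply by the (non-conjugated) twiddle: out1/out2 hold the
        real/imaginary parts of R * W, with the twiddle W = cos + j*sin
        packed into 'coeff' (operand order differs per endianness) */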
#ifndef ARM_MATH_BIG_ENDIAN
     out1 = __SMUSD(coeff, R) >> 16U;
     out2 = __SMUADX(coeff, R);
#else
     out1 = __SMUADX(R, coeff) >> 16U;
     out2 = __SMUSD(__QSUB(0, coeff), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

     write_q15x2_ia (&pSl, (q31_t)__PKHBT( out1, out2, 0 ));
  }

#else /* #if defined (ARM_MATH_DSP) */

  for (i = 0; i < n2; i++)
  {
     cosVal = pCoef[2 * i];
     sinVal = pCoef[2 * i + 1];

     l = i + n2;

     xt =           (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
     pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;

     yt =               (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
     pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;

     pSrc[2 * l]     = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) -
                        ((int16_t) (((q31_t) yt * sinVal) >> 16U))  );

     pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) +
                        ((int16_t) (((q31_t) xt * sinVal) >> 16U))  );
  }

#endif /* #if defined (ARM_MATH_DSP) */

  /* first col */
  arm_radix4_butterfly_inverse_q15( pSrc,          n2, (q15_t*)pCoef, 2U);

  /* second col */
  arm_radix4_butterfly_inverse_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U);

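  /* double the result to compensate for the extra halving performed by the
     initial radix-2 split stage, so that the overall scaling matches the
     pure radix-4 lengths (1/fftLen overall) */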
  n2 = fftLen >> 1U;
  for (i = 0; i < n2; i++)
  {
     p0 = pSrc[4 * i + 0];
     p1 = pSrc[4 * i + 1];
     p2 = pSrc[4 * i + 2];
     p3 = pSrc[4 * i + 3];

     p0 <<= 1U;
     p1 <<= 1U;
     p2 <<= 1U;
     p3 <<= 1U;

     pSrc[4 * i + 0] = p0;
     pSrc[4 * i + 1] = p1;
     pSrc[4 * i + 2] = p2;
     pSrc[4 * i + 3] = p3;
  }
}

#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */