1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cfft_q15.c
4  * Description:  Combined Radix Decimation in Q15 Frequency CFFT processing function
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/transform_functions.h"
30 
31 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
32 
33 #include "arm_vec_fft.h"
34 
35 
_arm_radix4_butterfly_q15_mve(const arm_cfft_instance_q15 * S,q15_t * pSrc,uint32_t fftLen)36 static void _arm_radix4_butterfly_q15_mve(
37     const arm_cfft_instance_q15 * S,
38     q15_t   *pSrc,
39     uint32_t fftLen)
40 {
41     q15x8_t vecTmp0, vecTmp1;
42     q15x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
43     q15x8_t vecA, vecB, vecC, vecD;
44     uint32_t  blkCnt;
45     uint32_t  n1, n2;
46     uint32_t  stage = 0;
47     int32_t  iter = 1;
48     static const int32_t strides[4] = {
49         (0 - 16) * (int32_t)sizeof(q15_t *), (4 - 16) * (int32_t)sizeof(q15_t *),
50         (8 - 16) * (int32_t)sizeof(q15_t *), (12 - 16) * (int32_t)sizeof(q15_t *)
51     };
52 
53     /*
54      * Process first stages
55      * Each stage in middle stages provides two down scaling of the input
56      */
57     n2 = fftLen;
58     n1 = n2;
59     n2 >>= 2u;
60 
61     for (int k = fftLen / 4u; k > 1; k >>= 2u)
62     {
63         q15_t const *p_rearranged_twiddle_tab_stride2 =
64             &S->rearranged_twiddle_stride2[
65             S->rearranged_twiddle_tab_stride2_arr[stage]];
66         q15_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
67             S->rearranged_twiddle_tab_stride3_arr[stage]];
68         q15_t const *p_rearranged_twiddle_tab_stride1 =
69             &S->rearranged_twiddle_stride1[
70             S->rearranged_twiddle_tab_stride1_arr[stage]];
71 
72         q15_t * pBase = pSrc;
73         for (int i = 0; i < iter; i++)
74         {
75             q15_t    *inA = pBase;
76             q15_t    *inB = inA + n2 * CMPLX_DIM;
77             q15_t    *inC = inB + n2 * CMPLX_DIM;
78             q15_t    *inD = inC + n2 * CMPLX_DIM;
79             q15_t const *pW1 = p_rearranged_twiddle_tab_stride1;
80             q15_t const *pW2 = p_rearranged_twiddle_tab_stride2;
81             q15_t const *pW3 = p_rearranged_twiddle_tab_stride3;
82             q15x8_t    vecW;
83 
84             blkCnt = n2 / 4;
85             /*
86              * load 4 x q15 complex pair
87              */
88             vecA = vldrhq_s16(inA);
89             vecC = vldrhq_s16(inC);
90             while (blkCnt > 0U)
91             {
92                 vecB = vldrhq_s16(inB);
93                 vecD = vldrhq_s16(inD);
94 
95                 vecSum0 = vhaddq(vecA, vecC);
96                 vecDiff0 = vhsubq(vecA, vecC);
97 
98                 vecSum1 = vhaddq(vecB, vecD);
99                 vecDiff1 = vhsubq(vecB, vecD);
100                 /*
101                  * [ 1 1 1 1 ] * [ A B C D ]' .* 1
102                  */
103                 vecTmp0 = vhaddq(vecSum0, vecSum1);
104                 vst1q(inA, vecTmp0);
105                 inA += 8;
106                 /*
107                  * [ 1 -1 1 -1 ] * [ A B C D ]'
108                  */
109                 vecTmp0 = vhsubq(vecSum0, vecSum1);
110                 /*
111                  * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
112                  */
113                 vecW = vld1q(pW2);
114                 pW2 += 8;
115                 vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q15x8_t);
116 
117                 vst1q(inB, vecTmp1);
118                 inB += 8;
119                 /*
120                  * [ 1 -i -1 +i ] * [ A B C D ]'
121                  */
122                 vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
123                 /*
124                  * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
125                  */
126                 vecW = vld1q(pW1);
127                 pW1 += 8;
128                 vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q15x8_t);
129                 vst1q(inC, vecTmp1);
130                 inC += 8;
131 
132                 /*
133                  * [ 1 +i -1 -i ] * [ A B C D ]'
134                  */
135                 vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
136                 /*
137                  * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
138                  */
139                 vecW = vld1q(pW3);
140                 pW3 += 8;
141                 vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q15x8_t);
142                 vst1q(inD, vecTmp1);
143                 inD += 8;
144 
145                 vecA = vldrhq_s16(inA);
146                 vecC = vldrhq_s16(inC);
147 
148                 blkCnt--;
149             }
150             pBase +=  CMPLX_DIM * n1;
151         }
152         n1 = n2;
153         n2 >>= 2u;
154         iter = iter << 2;
155         stage++;
156     }
157 
158     /*
159      * start of Last stage process
160      */
161     uint32x4_t vecScGathAddr = vld1q_u32 ((uint32_t*)strides);
162     vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
163 
164     /*
165      * load scheduling
166      */
167     vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
168     vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
169 
170     blkCnt = (fftLen >> 4);
171     while (blkCnt > 0U)
172     {
173         vecSum0 = vhaddq(vecA, vecC);
174         vecDiff0 = vhsubq(vecA, vecC);
175 
176         vecB = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 4);
177         vecD = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 12);
178 
179         vecSum1 = vhaddq(vecB, vecD);
180         vecDiff1 = vhsubq(vecB, vecD);
181         /*
182          * pre-load for next iteration
183          */
184         vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
185         vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
186 
187         vecTmp0 = vhaddq(vecSum0, vecSum1);
188         vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);
189 
190         vecTmp0 = vhsubq(vecSum0, vecSum1);
191         vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);
192 
193         vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
194         vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);
195 
196         vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
197         vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);
198 
199         blkCnt--;
200     }
201 
202 }
203 
arm_cfft_radix4by2_q15_mve(const arm_cfft_instance_q15 * S,q15_t * pSrc,uint32_t fftLen)204 static void arm_cfft_radix4by2_q15_mve(const arm_cfft_instance_q15 *S, q15_t *pSrc, uint32_t fftLen)
205 {
206     uint32_t n2;
207     q15_t *pIn0;
208     q15_t *pIn1;
209     const q15_t *pCoef = S->pTwiddle;
210     uint32_t     blkCnt;
211     q15x8_t    vecIn0, vecIn1, vecSum, vecDiff;
212     q15x8_t    vecCmplxTmp, vecTw;
213     q15_t  const *pCoefVec;
214 
215     n2 = fftLen >> 1;
216 
217     pIn0 = pSrc;
218     pIn1 = pSrc + fftLen;
219     pCoefVec = pCoef;
220 
221     blkCnt = n2 / 4;
222 
223     while (blkCnt > 0U)
224     {
225         vecIn0 = *(q15x8_t *) pIn0;
226         vecIn1 = *(q15x8_t *) pIn1;
227 
228         vecIn0 = vecIn0 >> 1;
229         vecIn1 = vecIn1 >> 1;
230         vecSum = vhaddq(vecIn0, vecIn1);
231         vst1q(pIn0, vecSum);
232         pIn0 += 8;
233 
234         vecTw = vld1q(pCoefVec);
235         pCoefVec += 8;
236 
237         vecDiff = vhsubq(vecIn0, vecIn1);
238         vecCmplxTmp = MVE_CMPLX_MULT_FX_AxConjB(vecDiff, vecTw, q15x8_t);
239         vst1q(pIn1, vecCmplxTmp);
240         pIn1 += 8;
241 
242         blkCnt--;
243     }
244 
245     _arm_radix4_butterfly_q15_mve(S, pSrc, n2);
246 
247     _arm_radix4_butterfly_q15_mve(S, pSrc + fftLen, n2);
248 
249 
250     pIn0 = pSrc;
251     blkCnt = (fftLen << 1) >> 3;
252     while (blkCnt > 0U)
253     {
254         vecIn0 = *(q15x8_t *) pIn0;
255         vecIn0 = vecIn0 << 1;
256         vst1q(pIn0, vecIn0);
257         pIn0 += 8;
258         blkCnt--;
259     }
260     /*
261      * tail
262      * (will be merged thru tail predication)
263      */
264     blkCnt = (fftLen << 1) & 7;
265     if (blkCnt > 0U)
266     {
267         mve_pred16_t p0 = vctp16q(blkCnt);
268 
269         vecIn0 = *(q15x8_t *) pIn0;
270         vecIn0 = vecIn0 << 1;
271         vstrhq_p(pIn0, vecIn0, p0);
272     }
273 }
274 
_arm_radix4_butterfly_inverse_q15_mve(const arm_cfft_instance_q15 * S,q15_t * pSrc,uint32_t fftLen)275 static void _arm_radix4_butterfly_inverse_q15_mve(const arm_cfft_instance_q15 *S,q15_t *pSrc, uint32_t fftLen)
276 {
277     q15x8_t vecTmp0, vecTmp1;
278     q15x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
279     q15x8_t vecA, vecB, vecC, vecD;
280     uint32_t  blkCnt;
281     uint32_t  n1, n2;
282     uint32_t  stage = 0;
283     int32_t  iter = 1;
284     static const int32_t strides[4] = {
285         (0 - 16) * (int32_t)sizeof(q15_t *), (4 - 16) * (int32_t)sizeof(q15_t *),
286         (8 - 16) * (int32_t)sizeof(q15_t *), (12 - 16) * (int32_t)sizeof(q15_t *)
287     };
288 
289 
290     /*
291      * Process first stages
292      * Each stage in middle stages provides two down scaling of the input
293      */
294     n2 = fftLen;
295     n1 = n2;
296     n2 >>= 2u;
297 
298     for (int k = fftLen / 4u; k > 1; k >>= 2u)
299     {
300         q15_t const *p_rearranged_twiddle_tab_stride2 =
301             &S->rearranged_twiddle_stride2[
302             S->rearranged_twiddle_tab_stride2_arr[stage]];
303         q15_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
304             S->rearranged_twiddle_tab_stride3_arr[stage]];
305         q15_t const *p_rearranged_twiddle_tab_stride1 =
306             &S->rearranged_twiddle_stride1[
307             S->rearranged_twiddle_tab_stride1_arr[stage]];
308 
309         q15_t * pBase = pSrc;
310         for (int i = 0; i < iter; i++)
311         {
312             q15_t    *inA = pBase;
313             q15_t    *inB = inA + n2 * CMPLX_DIM;
314             q15_t    *inC = inB + n2 * CMPLX_DIM;
315             q15_t    *inD = inC + n2 * CMPLX_DIM;
316             q15_t const *pW1 = p_rearranged_twiddle_tab_stride1;
317             q15_t const *pW2 = p_rearranged_twiddle_tab_stride2;
318             q15_t const *pW3 = p_rearranged_twiddle_tab_stride3;
319             q15x8_t    vecW;
320 
321 
322             blkCnt = n2 / 4;
323             /*
324              * load 4 x q15 complex pair
325              */
326             vecA = vldrhq_s16(inA);
327             vecC = vldrhq_s16(inC);
328             while (blkCnt > 0U)
329             {
330                 vecB = vldrhq_s16(inB);
331                 vecD = vldrhq_s16(inD);
332 
333                 vecSum0 = vhaddq(vecA, vecC);
334                 vecDiff0 = vhsubq(vecA, vecC);
335 
336                 vecSum1 = vhaddq(vecB, vecD);
337                 vecDiff1 = vhsubq(vecB, vecD);
338                 /*
339                  * [ 1 1 1 1 ] * [ A B C D ]' .* 1
340                  */
341                 vecTmp0 = vhaddq(vecSum0, vecSum1);
342                 vst1q(inA, vecTmp0);
343                 inA += 8;
344                 /*
345                  * [ 1 -1 1 -1 ] * [ A B C D ]'
346                  */
347                 vecTmp0 = vhsubq(vecSum0, vecSum1);
348                 /*
349                  * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
350                  */
351                 vecW = vld1q(pW2);
352                 pW2 += 8;
353                 vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q15x8_t);
354 
355                 vst1q(inB, vecTmp1);
356                 inB += 8;
357                 /*
358                  * [ 1 -i -1 +i ] * [ A B C D ]'
359                  */
360                 vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
361                 /*
362                  * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
363                  */
364                 vecW = vld1q(pW1);
365                 pW1 += 8;
366                 vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q15x8_t);
367                 vst1q(inC, vecTmp1);
368                 inC += 8;
369                 /*
370                  * [ 1 +i -1 -i ] * [ A B C D ]'
371                  */
372                 vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
373                 /*
374                  * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
375                  */
376                 vecW = vld1q(pW3);
377                 pW3 += 8;
378                 vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q15x8_t);
379                 vst1q(inD, vecTmp1);
380                 inD += 8;
381 
382                 vecA = vldrhq_s16(inA);
383                 vecC = vldrhq_s16(inC);
384 
385                 blkCnt--;
386             }
387             pBase +=  CMPLX_DIM * n1;
388         }
389         n1 = n2;
390         n2 >>= 2u;
391         iter = iter << 2;
392         stage++;
393     }
394 
395     /*
396      * start of Last stage process
397      */
398     uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides);
399     vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
400 
401     /*
402      * load scheduling
403      */
404     vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
405     vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
406 
407     blkCnt = (fftLen >> 4);
408     while (blkCnt > 0U)
409     {
410         vecSum0 = vhaddq(vecA, vecC);
411         vecDiff0 = vhsubq(vecA, vecC);
412 
413         vecB = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 4);
414         vecD = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 12);
415 
416         vecSum1 = vhaddq(vecB, vecD);
417         vecDiff1 = vhsubq(vecB, vecD);
418         /*
419          * pre-load for next iteration
420          */
421         vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
422         vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
423 
424         vecTmp0 = vhaddq(vecSum0, vecSum1);
425         vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);
426 
427         vecTmp0 = vhsubq(vecSum0, vecSum1);
428         vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);
429 
430         vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
431         vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);
432 
433         vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
434         vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);
435 
436         blkCnt--;
437     }
438 }
439 
arm_cfft_radix4by2_inverse_q15_mve(const arm_cfft_instance_q15 * S,q15_t * pSrc,uint32_t fftLen)440 static void arm_cfft_radix4by2_inverse_q15_mve(const arm_cfft_instance_q15 *S, q15_t *pSrc, uint32_t fftLen)
441 {
442     uint32_t n2;
443     q15_t *pIn0;
444     q15_t *pIn1;
445     const q15_t *pCoef = S->pTwiddle;
446 
447     uint32_t     blkCnt;
448     q15x8_t    vecIn0, vecIn1, vecSum, vecDiff;
449     q15x8_t    vecCmplxTmp, vecTw;
450     q15_t  const *pCoefVec;
451 
452     n2 = fftLen >> 1;
453 
454     pIn0 = pSrc;
455     pIn1 = pSrc + fftLen;
456     pCoefVec = pCoef;
457 
458     blkCnt = n2 / 4;
459 
460     while (blkCnt > 0U)
461     {
462         vecIn0 = *(q15x8_t *) pIn0;
463         vecIn1 = *(q15x8_t *) pIn1;
464 
465         vecIn0 = vecIn0 >> 1;
466         vecIn1 = vecIn1 >> 1;
467         vecSum = vhaddq(vecIn0, vecIn1);
468         vst1q(pIn0, vecSum);
469         pIn0 += 8;
470 
471         vecTw = vld1q(pCoefVec);
472         pCoefVec += 8;
473 
474         vecDiff = vhsubq(vecIn0, vecIn1);
475         vecCmplxTmp = vqrdmlsdhq(vuninitializedq_s16() , vecDiff, vecTw);
476         vecCmplxTmp = vqrdmladhxq(vecCmplxTmp, vecDiff, vecTw);
477         vst1q(pIn1, vecCmplxTmp);
478         pIn1 += 8;
479 
480         blkCnt--;
481     }
482 
483 
484     _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, n2);
485 
486     _arm_radix4_butterfly_inverse_q15_mve(S, pSrc + fftLen, n2);
487 
488     pIn0 = pSrc;
489     blkCnt = (fftLen << 1) >> 3;
490     while (blkCnt > 0U)
491     {
492         vecIn0 = *(q15x8_t *) pIn0;
493         vecIn0 = vecIn0 << 1;
494         vst1q(pIn0, vecIn0);
495         pIn0 += 8;
496         blkCnt--;
497     }
498     /*
499      * tail
500      * (will be merged thru tail predication)
501      */
502     blkCnt = (fftLen << 1) & 7;
503     while (blkCnt > 0U)
504     {
505         mve_pred16_t p0 = vctp16q(blkCnt);
506 
507         vecIn0 = *(q15x8_t *) pIn0;
508         vecIn0 = vecIn0 << 1;
509         vstrhq_p(pIn0, vecIn0, p0);
510     }
511 }
512 
513 /**
514   @addtogroup ComplexFFTQ15
515   @{
516  */
517 
518 /**
519   @brief         Processing function for Q15 complex FFT.
520   @param[in]     S               points to an instance of Q15 CFFT structure
521   @param[in,out] p1              points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
522   @param[in]     ifftFlag       flag that selects transform direction
523                    - value = 0: forward transform
524                    - value = 1: inverse transform
525   @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
526                    - value = 0: disables bit reversal of output
527                    - value = 1: enables bit reversal of output
528  */
arm_cfft_q15(const arm_cfft_instance_q15 * S,q15_t * pSrc,uint8_t ifftFlag,uint8_t bitReverseFlag)529 ARM_DSP_ATTRIBUTE void arm_cfft_q15(
530   const arm_cfft_instance_q15 * S,
531         q15_t * pSrc,
532         uint8_t ifftFlag,
533         uint8_t bitReverseFlag)
534 {
535         uint32_t fftLen = S->fftLen;
536 
537         if (ifftFlag == 1U) {
538 
539             switch (fftLen) {
540             case 16:
541             case 64:
542             case 256:
543             case 1024:
544             case 4096:
545                 _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, fftLen);
546                 break;
547 
548             case 32:
549             case 128:
550             case 512:
551             case 2048:
552                 arm_cfft_radix4by2_inverse_q15_mve(S, pSrc, fftLen);
553                 break;
554             }
555         } else {
556             switch (fftLen) {
557             case 16:
558             case 64:
559             case 256:
560             case 1024:
561             case 4096:
562                 _arm_radix4_butterfly_q15_mve(S, pSrc, fftLen);
563                 break;
564 
565             case 32:
566             case 128:
567             case 512:
568             case 2048:
569                 arm_cfft_radix4by2_q15_mve(S, pSrc, fftLen);
570                 break;
571             }
572         }
573 
574 
575         if (bitReverseFlag)
576         {
577 
578             arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
579 
580         }
581 }
582 
583 #else
584 
585 extern void arm_radix4_butterfly_q15(
586         q15_t * pSrc,
587         uint32_t fftLen,
588   const q15_t * pCoef,
589         uint32_t twidCoefModifier);
590 
591 extern void arm_radix4_butterfly_inverse_q15(
592         q15_t * pSrc,
593         uint32_t fftLen,
594   const q15_t * pCoef,
595         uint32_t twidCoefModifier);
596 
597 extern void arm_bitreversal_16(
598         uint16_t * pSrc,
599   const uint16_t bitRevLen,
600   const uint16_t * pBitRevTable);
601 
602 ARM_DSP_ATTRIBUTE void arm_cfft_radix4by2_q15(
603         q15_t * pSrc,
604         uint32_t fftLen,
605   const q15_t * pCoef);
606 
607 ARM_DSP_ATTRIBUTE void arm_cfft_radix4by2_inverse_q15(
608         q15_t * pSrc,
609         uint32_t fftLen,
610   const q15_t * pCoef);
611 
612 
613 
614 /**
615   @addtogroup ComplexFFTQ15
616   @{
617  */
618 
619 /**
620   @brief         Processing function for Q15 complex FFT.
621   @param[in]     S               points to an instance of Q15 CFFT structure
622   @param[in,out] p1              points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
623   @param[in]     ifftFlag       flag that selects transform direction
624                    - value = 0: forward transform
625                    - value = 1: inverse transform
626   @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
627                    - value = 0: disables bit reversal of output
628                    - value = 1: enables bit reversal of output
629  */
630 
arm_cfft_q15(const arm_cfft_instance_q15 * S,q15_t * p1,uint8_t ifftFlag,uint8_t bitReverseFlag)631 ARM_DSP_ATTRIBUTE void arm_cfft_q15(
632   const arm_cfft_instance_q15 * S,
633         q15_t * p1,
634         uint8_t ifftFlag,
635         uint8_t bitReverseFlag)
636 {
637   uint32_t L = S->fftLen;
638 
639   if (ifftFlag == 1U)
640   {
641      switch (L)
642      {
643      case 16:
644      case 64:
645      case 256:
646      case 1024:
647      case 4096:
648        arm_radix4_butterfly_inverse_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 );
649        break;
650 
651      case 32:
652      case 128:
653      case 512:
654      case 2048:
655        arm_cfft_radix4by2_inverse_q15 ( p1, L, S->pTwiddle );
656        break;
657      }
658   }
659   else
660   {
661      switch (L)
662      {
663      case 16:
664      case 64:
665      case 256:
666      case 1024:
667      case 4096:
668        arm_radix4_butterfly_q15  ( p1, L, (q15_t*)S->pTwiddle, 1 );
669        break;
670 
671      case 32:
672      case 128:
673      case 512:
674      case 2048:
675        arm_cfft_radix4by2_q15  ( p1, L, S->pTwiddle );
676        break;
677      }
678   }
679 
680   if ( bitReverseFlag )
681     arm_bitreversal_16 ((uint16_t*) p1, S->bitRevLength, S->pBitRevTable);
682 }
683 
684 /**
685   @} end of ComplexFFTQ15 group
686  */
687 
arm_cfft_radix4by2_q15(q15_t * pSrc,uint32_t fftLen,const q15_t * pCoef)688 ARM_DSP_ATTRIBUTE void arm_cfft_radix4by2_q15(
689         q15_t * pSrc,
690         uint32_t fftLen,
691   const q15_t * pCoef)
692 {
693         uint32_t i;
694         uint32_t n2;
695         q15_t p0, p1, p2, p3;
696 #if defined (ARM_MATH_DSP)
697         q31_t T, S, R;
698         q31_t coeff, out1, out2;
699   const q15_t *pC = pCoef;
700         q15_t *pSi = pSrc;
701         q15_t *pSl = pSrc + fftLen;
702 #else
703         uint32_t l;
704         q15_t xt, yt, cosVal, sinVal;
705 #endif
706 
707   n2 = fftLen >> 1U;
708 
709 #if defined (ARM_MATH_DSP)
710 
711   for (i = n2; i > 0; i--)
712   {
713       coeff = read_q15x2_ia (&pC);
714 
715       T = read_q15x2 (pSi);
716       T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
717 
718       S = read_q15x2 (pSl);
719       S = __SHADD16(S, 0); /* this is just a SIMD arithmetic shift right by 1 */
720 
721       R = __QSUB16(T, S);
722 
723       write_q15x2_ia (&pSi, __SHADD16(T, S));
724 
725 #ifndef ARM_MATH_BIG_ENDIAN
726       out1 = __SMUAD(coeff, R) >> 16U;
727       out2 = __SMUSDX(coeff, R);
728 #else
729       out1 = __SMUSDX(R, coeff) >> 16U;
730       out2 = __SMUAD(coeff, R);
731 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
732 
733       write_q15x2_ia (&pSl, (q31_t)__PKHBT( out1, out2, 0 ) );
734   }
735 
736 #else /* #if defined (ARM_MATH_DSP) */
737 
738   for (i = 0; i < n2; i++)
739   {
740      cosVal = pCoef[2 * i];
741      sinVal = pCoef[2 * i + 1];
742 
743      l = i + n2;
744 
745      xt =           (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
746      pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
747 
748      yt =               (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
749      pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
750 
751      pSrc[2 * l]     = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) +
752                         ((int16_t) (((q31_t) yt * sinVal) >> 16U))  );
753 
754      pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) -
755                         ((int16_t) (((q31_t) xt * sinVal) >> 16U))   );
756   }
757 
758 #endif /* #if defined (ARM_MATH_DSP) */
759 
760   /* first col */
761   arm_radix4_butterfly_q15( pSrc,          n2, (q15_t*)pCoef, 2U);
762 
763   /* second col */
764   arm_radix4_butterfly_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U);
765 
766   n2 = fftLen >> 1U;
767   for (i = 0; i < n2; i++)
768   {
769      p0 = pSrc[4 * i + 0];
770      p1 = pSrc[4 * i + 1];
771      p2 = pSrc[4 * i + 2];
772      p3 = pSrc[4 * i + 3];
773 
774      p0 <<= 1U;
775      p1 <<= 1U;
776      p2 <<= 1U;
777      p3 <<= 1U;
778 
779      pSrc[4 * i + 0] = p0;
780      pSrc[4 * i + 1] = p1;
781      pSrc[4 * i + 2] = p2;
782      pSrc[4 * i + 3] = p3;
783   }
784 
785 }
786 
arm_cfft_radix4by2_inverse_q15(q15_t * pSrc,uint32_t fftLen,const q15_t * pCoef)787 ARM_DSP_ATTRIBUTE void arm_cfft_radix4by2_inverse_q15(
788         q15_t * pSrc,
789         uint32_t fftLen,
790   const q15_t * pCoef)
791 {
792         uint32_t i;
793         uint32_t n2;
794         q15_t p0, p1, p2, p3;
795 #if defined (ARM_MATH_DSP)
796         q31_t T, S, R;
797         q31_t coeff, out1, out2;
798   const q15_t *pC = pCoef;
799         q15_t *pSi = pSrc;
800         q15_t *pSl = pSrc + fftLen;
801 #else
802         uint32_t l;
803         q15_t xt, yt, cosVal, sinVal;
804 #endif
805 
806   n2 = fftLen >> 1U;
807 
808 #if defined (ARM_MATH_DSP)
809 
810   for (i = n2; i > 0; i--)
811   {
812      coeff = read_q15x2_ia (&pC);
813 
814      T = read_q15x2 (pSi);
815      T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
816 
817      S = read_q15x2 (pSl);
818      S = __SHADD16(S, 0); /* this is just a SIMD arithmetic shift right by 1 */
819 
820      R = __QSUB16(T, S);
821 
822      write_q15x2_ia (&pSi, __SHADD16(T, S));
823 
824 #ifndef ARM_MATH_BIG_ENDIAN
825      out1 = __SMUSD(coeff, R) >> 16U;
826      out2 = __SMUADX(coeff, R);
827 #else
828      out1 = __SMUADX(R, coeff) >> 16U;
829      out2 = __SMUSD(__QSUB(0, coeff), R);
830 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
831 
832      write_q15x2_ia (&pSl, (q31_t)__PKHBT( out1, out2, 0 ));
833   }
834 
835 #else /* #if defined (ARM_MATH_DSP) */
836 
837   for (i = 0; i < n2; i++)
838   {
839      cosVal = pCoef[2 * i];
840      sinVal = pCoef[2 * i + 1];
841 
842      l = i + n2;
843 
844      xt =           (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
845      pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
846 
847      yt =               (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
848      pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
849 
850      pSrc[2 * l]      = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) -
851                          ((int16_t) (((q31_t) yt * sinVal) >> 16U))  );
852 
853      pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) +
854                         ((int16_t) (((q31_t) xt * sinVal) >> 16U))  );
855   }
856 
857 #endif /* #if defined (ARM_MATH_DSP) */
858 
859   /* first col */
860   arm_radix4_butterfly_inverse_q15( pSrc,          n2, (q15_t*)pCoef, 2U);
861 
862   /* second col */
863   arm_radix4_butterfly_inverse_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U);
864 
865   n2 = fftLen >> 1U;
866   for (i = 0; i < n2; i++)
867   {
868      p0 = pSrc[4 * i + 0];
869      p1 = pSrc[4 * i + 1];
870      p2 = pSrc[4 * i + 2];
871      p3 = pSrc[4 * i + 3];
872 
873      p0 <<= 1U;
874      p1 <<= 1U;
875      p2 <<= 1U;
876      p3 <<= 1U;
877 
878      pSrc[4 * i + 0] = p0;
879      pSrc[4 * i + 1] = p1;
880      pSrc[4 * i + 2] = p2;
881      pSrc[4 * i + 3] = p3;
882   }
883 }
884 
885 #endif /* defined(ARM_MATH_MVEI) */
886