1 /******************************************************************************
2  * @file     arm_vec_fft.h
3  * @brief    Private header file for CMSIS DSP Library
4  * @version  V1.7.0
5  * @date     07. January 2020
6  ******************************************************************************/
7 /*
8  * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
9  *
10  * SPDX-License-Identifier: Apache-2.0
11  *
12  * Licensed under the Apache License, Version 2.0 (the License); you may
13  * not use this file except in compliance with the License.
14  * You may obtain a copy of the License at
15  *
16  * www.apache.org/licenses/LICENSE-2.0
17  *
18  * Unless required by applicable law or agreed to in writing, software
19  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
20  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  * See the License for the specific language governing permissions and
22  * limitations under the License.
23  */
24 
25 #ifndef _ARM_VEC_FFT_H_
26 #define _ARM_VEC_FFT_H_
27 
28 #include "arm_math.h"
29 #include "arm_helium_utils.h"
30 
31 #ifdef   __cplusplus
32 extern "C"
33 {
34 #endif
35 
36 #if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
37 
/*
 * Complex arithmetic helpers layered on the MVE complex intrinsics.
 * Several of these expand an argument more than once, so only pass
 * expressions without side effects.
 */

/* Floating-point complex add/subtract with the second operand rotated
   (macro names: A + i*B and A - i*B). */
#define MVE_CMPLX_ADD_A_ixB(A, B)            vcaddq_rot90(A, B)
#define MVE_CMPLX_SUB_A_ixB(A, B)            vcaddq_rot270(A, B)

/* Floating-point complex products: a vcmulq partial product followed by a
   rotated multiply-accumulate (rot90 for A*B, rot270 for the conjugate form). */
#define MVE_CMPLX_MULT_FLT_AxB(A, B)         vcmlaq_rot90(vcmulq(A, B), A, B)
#define MVE_CMPLX_MULT_FLT_Conj_AxB(A, B)    vcmlaq_rot270(vcmulq(A, B), A, B)

/* Fixed-point complex products built from the saturating doubling dual
   multiply add/subtract intrinsics. The accumulator operand is an
   uninitialized vector cast to the type of A. */
#define MVE_CMPLX_MULT_FX_AxB(A, B)          vqdmladhxq(vqdmlsdhq((__typeof(A))vuninitializedq_s32(), A, B), A, B)
#define MVE_CMPLX_MULT_FX_AxConjB(A, B)      vqdmladhq(vqdmlsdhxq((__typeof(A))vuninitializedq_s32(), A, B), A, B)

/* Fixed-point counterparts of the rotated add/subtract, using the vhcaddq
   intrinsics. */
#define MVE_CMPLX_ADD_FX_A_ixB(A, B)         vhcaddq_rot90(A, B)
#define MVE_CMPLX_SUB_FX_A_ixB(A, B)         vhcaddq_rot270(A, B)
48 
49 
50 /**
51   @brief         In-place 32 bit reversal function for helium
52   @param[in,out] pSrc        points to in-place buffer of unknown 32-bit data type
53   @param[in]     bitRevLen   bit reversal table length
54   @param[in]     pBitRevTab  points to bit reversal table
55   @return        none
56 */
57 
arm_bitreversal_32_inpl_mve(uint32_t * pSrc,const uint16_t bitRevLen,const uint16_t * pBitRevTab)58 __STATIC_INLINE void arm_bitreversal_32_inpl_mve(
59         uint32_t *pSrc,
60   const uint16_t  bitRevLen,
61   const uint16_t *pBitRevTab)
62 
63 {
64     uint64_t       *src = (uint64_t *) pSrc;
65     int32_t         blkCnt;     /* loop counters */
66     uint32x4_t      bitRevTabOff;
67     uint32x4_t      one = vdupq_n_u32(1);
68     uint64x2_t      inLow, inHigh;
69     uint64x2_t      bitRevOff1Low, bitRevOff0Low;
70     uint64x2_t      bitRevOff1High, bitRevOff0High;
71 
72     /* load scheduling to increase gather load idx update / gather load distance */
73     bitRevTabOff = vldrhq_u32(pBitRevTab);
74     pBitRevTab += 4;
75 
76     bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
77     bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);
78 
79 
80     blkCnt = bitRevLen / 8;
81     while (blkCnt > 0) {
82         bitRevTabOff = vldrhq_u32(pBitRevTab);
83         pBitRevTab += 4;
84 
85         /* 64-bit index expansion */
86         bitRevOff1Low = vmullbq_int_u32(bitRevTabOff, one);
87         bitRevOff1High = vmulltq_int_u32(bitRevTabOff, one);
88 
89         inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
90         inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);
91 
92         vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
93         vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
94 
95 
96         /* unrolled */
97         bitRevTabOff = vldrhq_u32(pBitRevTab);
98         pBitRevTab += 4;
99 
100         bitRevOff0Low = vmullbq_int_u32(bitRevTabOff, one);
101         bitRevOff0High = vmulltq_int_u32(bitRevTabOff, one);
102 
103         inLow = vldrdq_gather_offset_u64(src, bitRevOff1Low);
104         inHigh = vldrdq_gather_offset_u64(src, bitRevOff1High);
105 
106         vstrdq_scatter_offset_u64(src, bitRevOff1Low, inHigh);
107         vstrdq_scatter_offset_u64(src, bitRevOff1High, inLow);
108 
109         /*
110          * Decrement the blockSize loop counter
111          */
112         blkCnt--;
113     }
114 
115     if (bitRevLen & 7) {
116         /* FFT size = 16 */
117         inLow = vldrdq_gather_offset_u64(src, bitRevOff0Low);
118         inHigh = vldrdq_gather_offset_u64(src, bitRevOff0High);
119 
120         vstrdq_scatter_offset_u64(src, bitRevOff0Low, inHigh);
121         vstrdq_scatter_offset_u64(src, bitRevOff0High, inLow);
122     }
123 }
124 
125 
126 
127 /**
128   @brief         In-place 16 bit reversal function for helium
129   @param[in,out] pSrc        points to in-place buffer of unknown 16-bit data type
130   @param[in]     bitRevLen   bit reversal table length
131   @param[in]     pBitRevTab  points to bit reversal table
132   @return        none
133 */
134 
arm_bitreversal_16_inpl_mve(uint16_t * pSrc,const uint16_t bitRevLen,const uint16_t * pBitRevTab)135 __STATIC_INLINE void arm_bitreversal_16_inpl_mve(
136         uint16_t *pSrc,
137   const uint16_t bitRevLen,
138   const uint16_t *pBitRevTab)
139 
140 {
141     uint32_t       *src = (uint32_t *) pSrc;
142     int32_t         blkCnt;     /* loop counters */
143     uint16x8_t      bitRevTabOff;
144     uint16x8_t      one = vdupq_n_u16(1);
145     uint32x4_t      bitRevOff1Low, bitRevOff0Low;
146     uint32x4_t      bitRevOff1High, bitRevOff0High;
147     uint32x4_t      inLow, inHigh;
148 
149     /* load scheduling to increase gather load idx update / gather load distance */
150     bitRevTabOff = vldrhq_u16(pBitRevTab);
151     pBitRevTab += 8;
152 
153     bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
154     bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
155     bitRevOff0Low = (uint32x4_t)vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
156     bitRevOff0High = (uint32x4_t)vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
157 
158     blkCnt = (bitRevLen / 16);
159     while (blkCnt > 0) {
160         bitRevTabOff = vldrhq_u16(pBitRevTab);
161         pBitRevTab += 8;
162 
163         bitRevOff1Low = vmullbq_int_u16(bitRevTabOff, one);
164         bitRevOff1High = vmulltq_int_u16(bitRevTabOff, one);
165         bitRevOff1Low = (uint32x4_t)vshrq_n_u16((uint16x8_t)bitRevOff1Low, 3);
166         bitRevOff1High = (uint32x4_t)vshrq_n_u16((uint16x8_t)bitRevOff1High, 3);
167 
168         inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
169         inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
170 
171         vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
172         vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
173 
174         /* loop unrolling */
175         bitRevTabOff = vldrhq_u16(pBitRevTab);
176         pBitRevTab += 8;
177 
178         bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
179         bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
180         bitRevOff0Low = (uint32x4_t)vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
181         bitRevOff0High = (uint32x4_t)vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
182 
183         inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff1Low);
184         inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff1High);
185 
186         vstrwq_scatter_shifted_offset_u32(src, bitRevOff1Low, inHigh);
187         vstrwq_scatter_shifted_offset_u32(src, bitRevOff1High, inLow);
188 
189         blkCnt--;
190     }
191 
192     /* tail handling */
193     blkCnt = bitRevLen & 0xf;
194     if (blkCnt == 8) {
195         inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
196         inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
197 
198         vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
199         vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
200     } else if (blkCnt == 12) {
201         /* FFT 16 special case */
202         mve_pred16_t    p = vctp16q(4);
203 
204         bitRevTabOff = vldrhq_z_u16(pBitRevTab, p);
205 
206         inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
207         inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
208 
209         vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
210         vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
211 
212         bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
213         bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
214         bitRevOff0Low = (uint32x4_t)vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
215         bitRevOff0High = (uint32x4_t)vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
216 
217         inLow = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0Low, p);
218         inHigh = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0High, p);
219 
220         vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0Low, inHigh, p);
221         vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff0High, inLow, p);
222     }
223 }
224 
225 /**
226   @brief         Out-of-place 32 bit reversal function for helium
227   @param[out]   pDst        points to destination buffer of unknown 32-bit data type
228   @param[in]    pSrc        points to input buffer of unknown 32-bit data type
229   @param[in]    fftLen      FFT length
230   @return       none
231 */
arm_bitreversal_32_outpl_mve(void * pDst,void * pSrc,uint32_t fftLen)232 __STATIC_INLINE void arm_bitreversal_32_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
233 {
234     uint32x4_t      idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
235     uint32_t        bitRevPos, blkCnt;
236     uint32_t       *pDst32 = (uint32_t *) pDst;
237 
238     /* fwd indexes */
239     idxOffs0 = vdupq_n_u32(0);
240     idxOffs1 = vdupq_n_u32(0);
241     idxOffs0[0] = 0;    idxOffs0[2] = 4;
242     idxOffs1[0] = 8;    idxOffs1[2] = 12;
243 
244     bitRevPos = (31 - __CLZ(fftLen)) + 5;
245     blkCnt = fftLen >> 2;
246 
247     /* issued earlier to increase gather load idx update / gather load distance */
248     /* bit-reverse fwd indexes */
249     bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
250     bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
251     while (blkCnt > 0) {
252         uint64x2_t      vecIn;
253 
254         vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs0);
255         idxOffs0 = idxOffs0 + 16;
256         vst1q(pDst32, (uint32x4_t) vecIn);
257         pDst32 += 4;
258         bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
259 
260         vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs1);
261         idxOffs1 = idxOffs1 + 16;
262         vst1q(pDst32, (uint32x4_t) vecIn);
263         pDst32 += 4;
264         bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
265 
266         blkCnt--;
267     }
268 }
269 
270 
271 /**
272   @brief         Out-of-place 16 bit reversal function for helium
273   @param[out]   pDst        points to destination buffer of unknown 16-bit data type
274   @param[in]    pSrc        points to input buffer of unknown 16-bit data type
275   @param[in]    fftLen      FFT length
276   @return       none
277 */
278 
arm_bitreversal_16_outpl_mve(void * pDst,void * pSrc,uint32_t fftLen)279 __STATIC_INLINE void arm_bitreversal_16_outpl_mve(void *pDst, void *pSrc, uint32_t fftLen)
280 {
281     uint32x4_t      idxOffs0, idxOffs1, bitRevOffs0, bitRevOffs1;
282     uint32_t        bitRevPos, blkCnt;
283     uint16_t       *pDst16 = (uint16_t *) pDst;
284     uint32_t        incrIdx = 0;
285 
286     /* fwd indexes */
287     idxOffs0 = vidupq_wb_u32(&incrIdx, 4);    // {0, 4, 8, 12}
288     idxOffs1 = vidupq_wb_u32(&incrIdx, 4);    // {16, 20, 24, 28}
289 
290     bitRevPos = (31 - __CLZ(fftLen)) + 4;
291     blkCnt = fftLen >> 3;
292 
293     /* issued earlier to increase gather load idx update / gather load distance */
294     /* bit-reverse fwd indexes */
295     bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
296     bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
297     while (blkCnt > 0) {
298         uint32x4_t      vecIn;
299 
300         vecIn = (uint32x4_t)vldrwq_gather_offset_s32(pSrc, bitRevOffs0);
301         idxOffs0 = idxOffs0 + 32;
302         vst1q(pDst16, (uint16x8_t) vecIn);
303         pDst16 += 8;
304         bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
305 
306         vecIn = (uint32x4_t)vldrwq_gather_offset_s32(pSrc, bitRevOffs1);
307         idxOffs1 = idxOffs1 + 32;
308         vst1q(pDst16, (uint16x8_t) vecIn);
309         pDst16 += 8;
310         bitRevOffs1 = vbrsrq(idxOffs1, bitRevPos);
311 
312         blkCnt--;
313     }
314 }
315 
316 
#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */
318 
319 
320 #ifdef   __cplusplus
321 }
322 #endif
323 
324 
325 #endif /* _ARM_VEC_FFT_H_ */