/******************************************************************************
 *
 *  Copyright 2022 Google LLC
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at:
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 ******************************************************************************/

#if (__ARM_NEON && __ARM_ARCH_ISA_A64 && \
        !defined(TEST_ARM)) || defined(TEST_NEON)

#ifndef TEST_NEON
#include <arm_neon.h>
#endif /* TEST_NEON */


/**
 * FFT 5 Points
 * The number of interleaved transforms `n` is assumed to be even
 */
#ifndef fft_5

LC3_HOT static inline void neon_fft_5(
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
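    /* Twiddle factors of the 5-point DFT: cos1 = cos(2*pi/5),
     * cos2 = cos(4*pi/5), sin1 = sin(2*pi/5), sin2 = sin(4*pi/5).
     * The sine constants alternate sign across the (re, im) lanes so
     * that, combined with the `vrev64q` real/imaginary swap below,
     * plain fused multiplies implement the ±i*sin rotations. */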
    static const union { float f[2]; uint64_t u64; }
        __cos1 = { {  0.3090169944,  0.3090169944 } },
        __cos2 = { { -0.8090169944, -0.8090169944 } },
        __sin1 = { {  0.9510565163, -0.9510565163 } },
        __sin2 = { {  0.5877852523, -0.5877852523 } };

    float32x2_t sin1 = vcreate_f32(__sin1.u64);
    float32x2_t sin2 = vcreate_f32(__sin2.u64);
    float32x2_t cos1 = vcreate_f32(__cos1.u64);
    float32x2_t cos2 = vcreate_f32(__cos2.u64);

    float32x4_t sin1q = vcombine_f32(sin1, sin1);
    float32x4_t sin2q = vcombine_f32(sin2, sin2);
    float32x4_t cos1q = vcombine_f32(cos1, cos1);
    float32x4_t cos2q = vcombine_f32(cos2, cos2);

    for (int i = 0; i < n; i += 2, x += 2, y += 10) {

        float32x4_t y0, y1, y2, y3, y4;

        float32x4_t x0 = vld1q_f32( (float *)(x + 0*n) );
        float32x4_t x1 = vld1q_f32( (float *)(x + 1*n) );
        float32x4_t x2 = vld1q_f32( (float *)(x + 2*n) );
        float32x4_t x3 = vld1q_f32( (float *)(x + 3*n) );
        float32x4_t x4 = vld1q_f32( (float *)(x + 4*n) );

        float32x4_t s14 = vaddq_f32(x1, x4);
        float32x4_t s23 = vaddq_f32(x2, x3);

        float32x4_t d14 = vrev64q_f32( vsubq_f32(x1, x4) );
        float32x4_t d23 = vrev64q_f32( vsubq_f32(x2, x3) );
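        /* `vrev64q` swapped the real and imaginary parts of each
         * difference; with the sign-alternating sine lanes, the fused
         * multiply-add/subtract pairs below apply the ±i*sin rotations
         * of the odd-symmetric terms. */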

        y0 = vaddq_f32( x0, vaddq_f32(s14, s23) );

        y4 = vfmaq_f32( x0, s14, cos1q );
        y4 = vfmaq_f32( y4, s23, cos2q );

        y1 = vfmaq_f32( y4, d14, sin1q );
        y1 = vfmaq_f32( y1, d23, sin2q );

        y4 = vfmsq_f32( y4, d14, sin1q );
        y4 = vfmsq_f32( y4, d23, sin2q );

        y3 = vfmaq_f32( x0, s14, cos2q );
        y3 = vfmaq_f32( y3, s23, cos1q );

        y2 = vfmaq_f32( y3, d14, sin2q );
        y2 = vfmsq_f32( y2, d23, sin1q );

        y3 = vfmsq_f32( y3, d14, sin2q );
        y3 = vfmaq_f32( y3, d23, sin1q );

        /* Store the two interleaved transforms sequentially: lanes 0-1
         * hold the first transform's outputs y[0..4], lanes 2-3 the
         * second transform's y[5..9]. */
        vst1_f32( (float *)(y + 0), vget_low_f32(y0) );
        vst1_f32( (float *)(y + 1), vget_low_f32(y1) );
        vst1_f32( (float *)(y + 2), vget_low_f32(y2) );
        vst1_f32( (float *)(y + 3), vget_low_f32(y3) );
        vst1_f32( (float *)(y + 4), vget_low_f32(y4) );

        vst1_f32( (float *)(y + 5), vget_high_f32(y0) );
        vst1_f32( (float *)(y + 6), vget_high_f32(y1) );
        vst1_f32( (float *)(y + 7), vget_high_f32(y2) );
        vst1_f32( (float *)(y + 8), vget_high_f32(y3) );
        vst1_f32( (float *)(y + 9), vget_high_f32(y4) );
    }
}
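
/*
 * For documentation, a minimal scalar sketch of the same 5-point DFT,
 * processing one transform per iteration instead of two. The `re`/`im`
 * field names and the guard macro (never defined here) are assumptions,
 * not part of the library.
 */
#ifdef LC3_FFT_NEON_SKETCH
static void scalar_fft_5(
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    const float cos1 =  0.3090169944f, cos2 = -0.8090169944f;
    const float sin1 =  0.9510565163f, sin2 =  0.5877852523f;

    for (int i = 0; i < n; i++, x++, y += 5) {
        struct lc3_complex x0 = x[0*n], x1 = x[1*n],
                           x2 = x[2*n], x3 = x[3*n], x4 = x[4*n];

        /* Symmetric sums and antisymmetric differences of paired legs */
        float s14_re = x1.re + x4.re, s14_im = x1.im + x4.im;
        float s23_re = x2.re + x3.re, s23_im = x2.im + x3.im;
        float d14_re = x1.re - x4.re, d14_im = x1.im - x4.im;
        float d23_re = x2.re - x3.re, d23_im = x2.im - x3.im;

        y[0].re = x0.re + s14_re + s23_re;
        y[0].im = x0.im + s14_im + s23_im;

        /* y[1]/y[4] = a -/+ i*b, with a and b as below */
        float a_re = x0.re + cos1*s14_re + cos2*s23_re;
        float a_im = x0.im + cos1*s14_im + cos2*s23_im;
        float b_re = sin1*d14_re + sin2*d23_re;
        float b_im = sin1*d14_im + sin2*d23_im;

        y[1].re = a_re + b_im;  y[1].im = a_im - b_re;
        y[4].re = a_re - b_im;  y[4].im = a_im + b_re;

        /* y[2]/y[3] = c -/+ i*e, the conjugate-symmetric partners */
        float c_re = x0.re + cos2*s14_re + cos1*s23_re;
        float c_im = x0.im + cos2*s14_im + cos1*s23_im;
        float e_re = sin2*d14_re - sin1*d23_re;
        float e_im = sin2*d14_im - sin1*d23_im;

        y[2].re = c_re + e_im;  y[2].im = c_im - e_re;
        y[3].re = c_re - e_im;  y[3].im = c_im + e_re;
    }
}
#endif /* LC3_FFT_NEON_SKETCH */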

#ifndef TEST_NEON
#define fft_5 neon_fft_5
#endif

#endif /* fft_5 */

/**
 * FFT Butterfly 3 Points
 */
#ifndef fft_bf3

LC3_HOT static inline void neon_fft_bf3(
    const struct lc3_fft_bf3_twiddles *twiddles,
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    int n3 = twiddles->n3;
    const struct lc3_complex (*w0_ptr)[2] = twiddles->t;
    const struct lc3_complex (*w1_ptr)[2] = w0_ptr + n3;
    const struct lc3_complex (*w2_ptr)[2] = w1_ptr + n3;

    const struct lc3_complex *x0_ptr = x;
    const struct lc3_complex *x1_ptr = x0_ptr + n*n3;
    const struct lc3_complex *x2_ptr = x1_ptr + n*n3;

    struct lc3_complex *y0_ptr = y;
    struct lc3_complex *y1_ptr = y0_ptr + n3;
    struct lc3_complex *y2_ptr = y1_ptr + n3;
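
    /* The three input legs are contiguous blocks of n*n3 complex values,
     * walked in lockstep; each twiddle entry packs the (w1, w2) pair
     * applied to the second and third legs, with one block of n3 pairs
     * per output group. */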

    for (int j, i = 0; i < n; i++,
            y0_ptr += 3*n3, y1_ptr += 3*n3, y2_ptr += 3*n3) {

        /* --- Process by pair --- */

        for (j = 0; j < (n3 >> 1); j++,
                x0_ptr += 2, x1_ptr += 2, x2_ptr += 2) {

            float32x4_t x0 = vld1q_f32( (float *)x0_ptr );
            float32x4_t x1 = vld1q_f32( (float *)x1_ptr );
            float32x4_t x2 = vld1q_f32( (float *)x2_ptr );

            float32x4_t x1r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x1)), x1 );
            float32x4_t x2r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x2)), x2 );
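            /* x1r and x2r hold (-im, re) of each complex value, so
             * x*Re(w) + xr*Im(w) expands to the full complex product
             * x*w using fused multiply-adds only. */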

            float32x4x2_t wn;
            float32x4_t yn;

            wn = vld2q_f32( (float *)(w0_ptr + 2*j) );

            yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
            yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
            vst1q_f32( (float *)(y0_ptr + 2*j), yn );

            wn = vld2q_f32( (float *)(w1_ptr + 2*j) );

            yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
            yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
            vst1q_f32( (float *)(y1_ptr + 2*j), yn );

            wn = vld2q_f32( (float *)(w2_ptr + 2*j) );

            yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
            yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
            yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
            vst1q_f32( (float *)(y2_ptr + 2*j), yn );

        }

        /* --- Last iteration, if n3 is odd --- */

        if (n3 & 1) {

            float32x2x2_t wn;
            float32x2_t yn;

            float32x2_t x0 = vld1_f32( (float *)(x0_ptr++) );
            float32x2_t x1 = vld1_f32( (float *)(x1_ptr++) );
            float32x2_t x2 = vld1_f32( (float *)(x2_ptr++) );

            float32x2_t x1r = vtrn1_f32( vrev64_f32(vneg_f32(x1)), x1 );
            float32x2_t x2r = vtrn1_f32( vrev64_f32(vneg_f32(x2)), x2 );

            wn = vld2_f32( (float *)(w0_ptr + 2*j) );

            yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
            yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
            vst1_f32( (float *)(y0_ptr + 2*j), yn );

            wn = vld2_f32( (float *)(w1_ptr + 2*j) );

            yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
            yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
            vst1_f32( (float *)(y1_ptr + 2*j), yn );

            wn = vld2_f32( (float *)(w2_ptr + 2*j) );

            yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
            yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
            yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
            vst1_f32( (float *)(y2_ptr + 2*j), yn );
        }

    }
}
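
/*
 * Scalar sketch of the radix-3 butterfly, for documentation only. Each
 * output group k applies its own pair of twiddles (w1, w2) to the second
 * and third input legs: y = x0 + w1*x1 + w2*x2. The `re`/`im` field
 * names and the guard macro are assumptions, as above.
 */
#ifdef LC3_FFT_NEON_SKETCH
static void scalar_fft_bf3(
    const struct lc3_fft_bf3_twiddles *twiddles,
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    int n3 = twiddles->n3;

    for (int i = 0; i < n; i++, y += 3*n3)
        for (int j = 0; j < n3; j++, x++) {
            struct lc3_complex x0 = x[0], x1 = x[n*n3], x2 = x[2*n*n3];

            for (int k = 0; k < 3; k++) {
                const struct lc3_complex *w = twiddles->t[k*n3 + j];

                y[k*n3 + j].re = x0.re
                    + x1.re * w[0].re - x1.im * w[0].im
                    + x2.re * w[1].re - x2.im * w[1].im;
                y[k*n3 + j].im = x0.im
                    + x1.im * w[0].re + x1.re * w[0].im
                    + x2.im * w[1].re + x2.re * w[1].im;
            }
        }
}
#endif /* LC3_FFT_NEON_SKETCH */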

#ifndef TEST_NEON
#define fft_bf3 neon_fft_bf3
#endif

#endif /* fft_bf3 */

/**
 * FFT Butterfly 2 Points
 */
#ifndef fft_bf2

LC3_HOT static inline void neon_fft_bf2(
    const struct lc3_fft_bf2_twiddles *twiddles,
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    int n2 = twiddles->n2;
    const struct lc3_complex *w_ptr = twiddles->t;

    const struct lc3_complex *x0_ptr = x;
    const struct lc3_complex *x1_ptr = x0_ptr + n*n2;

    struct lc3_complex *y0_ptr = y;
    struct lc3_complex *y1_ptr = y0_ptr + n2;

    for (int j, i = 0; i < n; i++, y0_ptr += 2*n2, y1_ptr += 2*n2) {

        /* --- Process by pair --- */

        for (j = 0; j < (n2 >> 1); j++, x0_ptr += 2, x1_ptr += 2) {

            float32x4_t x0 = vld1q_f32( (float *)x0_ptr );
            float32x4_t x1 = vld1q_f32( (float *)x1_ptr );
            float32x4_t y0, y1;

            float32x4_t x1r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x1)), x1 );
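            /* Same (-im, re) swap as in the radix-3 pass: y0/y1 below
             * compute x0 ± x1*w, with the complex product expanded into
             * fused multiply-add/subtract pairs. */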

            float32x4_t w = vld1q_f32( (float *)(w_ptr + 2*j) );
            float32x4_t w_re = vtrn1q_f32(w, w);
            float32x4_t w_im = vtrn2q_f32(w, w);

            y0 = vfmaq_f32( x0, x1 , w_re );
            y0 = vfmaq_f32( y0, x1r, w_im );
            vst1q_f32( (float *)(y0_ptr + 2*j), y0 );

            y1 = vfmsq_f32( x0, x1 , w_re );
            y1 = vfmsq_f32( y1, x1r, w_im );
            vst1q_f32( (float *)(y1_ptr + 2*j), y1 );
        }

        /* --- Last iteration, if n2 is odd --- */

        if (n2 & 1) {

            float32x2_t x0 = vld1_f32( (float *)(x0_ptr++) );
            float32x2_t x1 = vld1_f32( (float *)(x1_ptr++) );
            float32x2_t y0, y1;

            float32x2_t x1r = vtrn1_f32( vrev64_f32(vneg_f32(x1)), x1 );

            float32x2_t w = vld1_f32( (float *)(w_ptr + 2*j) );
            float32x2_t w_re = vtrn1_f32(w, w);
            float32x2_t w_im = vtrn2_f32(w, w);

            y0 = vfma_f32( x0, x1 , w_re );
            y0 = vfma_f32( y0, x1r, w_im );
            vst1_f32( (float *)(y0_ptr + 2*j), y0 );

            y1 = vfms_f32( x0, x1 , w_re );
            y1 = vfms_f32( y1, x1r, w_im );
            vst1_f32( (float *)(y1_ptr + 2*j), y1 );
        }
    }
}
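
/*
 * Scalar sketch of the radix-2 butterfly: y0 = x0 + w*x1 and
 * y1 = x0 - w*x1, one twiddle per output column. Same assumptions
 * (field names, guard macro) as the sketches above.
 */
#ifdef LC3_FFT_NEON_SKETCH
static void scalar_fft_bf2(
    const struct lc3_fft_bf2_twiddles *twiddles,
    const struct lc3_complex *x, struct lc3_complex *y, int n)
{
    int n2 = twiddles->n2;
    const struct lc3_complex *w = twiddles->t;

    for (int i = 0; i < n; i++, y += 2*n2)
        for (int j = 0; j < n2; j++, x++) {
            struct lc3_complex x0 = x[0], x1 = x[n*n2];

            /* p = w * x1, as a full complex product */
            float p_re = x1.re * w[j].re - x1.im * w[j].im;
            float p_im = x1.im * w[j].re + x1.re * w[j].im;

            y[     j].re = x0.re + p_re;  y[     j].im = x0.im + p_im;
            y[n2 + j].re = x0.re - p_re;  y[n2 + j].im = x0.im - p_im;
        }
}
#endif /* LC3_FFT_NEON_SKETCH */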

#ifndef TEST_NEON
#define fft_bf2 neon_fft_bf2
#endif

#endif /* fft_bf2 */

#endif /* __ARM_NEON && __ARM_ARCH_ISA_A64 */