1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_rfft_fast_f16.c
4 * Description: RFFT & RIFFT Floating point process function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/transform_functions_f16.h"
30 #include "arm_common_tables_f16.h"
31
32 #if defined(ARM_FLOAT16_SUPPORTED)
33
34
35 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
36
stage_rfft_f16(const arm_rfft_fast_instance_f16 * S,const float16_t * p,float16_t * pOut)37 static void stage_rfft_f16(
38 const arm_rfft_fast_instance_f16 * S,
39 const float16_t * p,
40 float16_t * pOut)
41 {
42 int32_t k; /* Loop Counter */
43 float16_t twR, twI; /* RFFT Twiddle coefficients */
44 const float16_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
45 const float16_t *pA = p; /* increasing pointer */
46 const float16_t *pB = p; /* decreasing pointer */
47 float16_t xAR, xAI, xBR, xBI; /* temporary variables */
48 float16_t t1a, t1b; /* temporary variables */
49 float16_t p0, p1, p2, p3; /* temporary variables */
50
51 float16x8x2_t tw,xA,xB;
52 float16x8x2_t tmp1, tmp2, res;
53
54 uint16x8_t vecStridesBkwd;
55
56 vecStridesBkwd = vddupq_u16((uint16_t)14, 2);
57
58
59 int blockCnt;
60
61
62 k = (S->Sint).fftLen - 1;
63
64 /* Pack first and last sample of the frequency domain together */
65
66 xBR = pB[0];
67 xBI = pB[1];
68 xAR = pA[0];
69 xAI = pA[1];
70
71 twR = *pCoeff++ ;
72 twI = *pCoeff++ ;
73
74 // U1 = XA(1) + XB(1); % It is real
75 t1a = (_Float16)xBR + (_Float16)xAR ;
76
77 // U2 = XB(1) - XA(1); % It is imaginary
78 t1b = (_Float16)xBI + (_Float16)xAI ;
79
80 // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
81 // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
82 *pOut++ = 0.5f16 * ( (_Float16)t1a + (_Float16)t1b );
83 *pOut++ = 0.5f16 * ( (_Float16)t1a - (_Float16)t1b );
84
85 // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) ));
86 pB = p + 2*k - 14;
87 pA += 2;
88
89 blockCnt = k >> 3;
90 while (blockCnt > 0)
91 {
92 /*
93 function X = my_split_rfft(X, ifftFlag)
94 % X is a series of real numbers
95 L = length(X);
96 XC = X(1:2:end) +i*X(2:2:end);
97 XA = fft(XC);
98 XB = conj(XA([1 end:-1:2]));
99 TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
100 for l = 2:L/2
101 XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
102 end
103 XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
104 X = XA;
105 */
106
107
108 xA = vld2q_f16(pA);
109 pA += 16;
110
111 xB = vld2q_f16(pB);
112
113 xB.val[0] = vldrhq_gather_shifted_offset_f16(pB, vecStridesBkwd);
114 xB.val[1] = vldrhq_gather_shifted_offset_f16(&pB[1], vecStridesBkwd);
115
116 xB.val[1] = vnegq_f16(xB.val[1]);
117 pB -= 16;
118
119
120 tw = vld2q_f16(pCoeff);
121 pCoeff += 16;
122
123
124 tmp1.val[0] = vaddq_f16(xA.val[0],xB.val[0]);
125 tmp1.val[1] = vaddq_f16(xA.val[1],xB.val[1]);
126
127 tmp2.val[0] = vsubq_f16(xB.val[0],xA.val[0]);
128 tmp2.val[1] = vsubq_f16(xB.val[1],xA.val[1]);
129
130 res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
131 res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
132
133 res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
134 res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
135
136 res.val[0] = vaddq_f16(res.val[0],tmp1.val[0] );
137 res.val[1] = vaddq_f16(res.val[1],tmp1.val[1] );
138
139 res.val[0] = vmulq_n_f16(res.val[0], 0.5f);
140 res.val[1] = vmulq_n_f16(res.val[1], 0.5f);
141
142
143 vst2q_f16(pOut, res);
144 pOut += 16;
145
146
147 blockCnt--;
148 }
149
150 pB += 14;
151 blockCnt = k & 7;
152 while (blockCnt > 0)
153 {
154 /*
155 function X = my_split_rfft(X, ifftFlag)
156 % X is a series of real numbers
157 L = length(X);
158 XC = X(1:2:end) +i*X(2:2:end);
159 XA = fft(XC);
160 XB = conj(XA([1 end:-1:2]));
161 TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
162 for l = 2:L/2
163 XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
164 end
165 XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
166 X = XA;
167 */
168
169 xBI = pB[1];
170 xBR = pB[0];
171 xAR = pA[0];
172 xAI = pA[1];
173
174 twR = *pCoeff++;
175 twI = *pCoeff++;
176
177 t1a = (_Float16)xBR - (_Float16)xAR ;
178 t1b = (_Float16)xBI + (_Float16)xAI ;
179
180 // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
181 // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
182 p0 = (_Float16)twR * (_Float16)t1a;
183 p1 = (_Float16)twI * (_Float16)t1a;
184 p2 = (_Float16)twR * (_Float16)t1b;
185 p3 = (_Float16)twI * (_Float16)t1b;
186
187 *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR + (_Float16)p0 + (_Float16)p3 ); //xAR
188 *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)p1 - (_Float16)p2 ); //xAI
189
190 pA += 2;
191 pB -= 2;
192 blockCnt--;
193 }
194 }
195
196 /* Prepares data for inverse cfft */
merge_rfft_f16(const arm_rfft_fast_instance_f16 * S,const float16_t * p,float16_t * pOut)197 static void merge_rfft_f16(
198 const arm_rfft_fast_instance_f16 * S,
199 const float16_t * p,
200 float16_t * pOut)
201 {
202 int32_t k; /* Loop Counter */
203 float16_t twR, twI; /* RFFT Twiddle coefficients */
204 const float16_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
205 const float16_t *pA = p; /* increasing pointer */
206 const float16_t *pB = p; /* decreasing pointer */
207 float16_t xAR, xAI, xBR, xBI; /* temporary variables */
208 float16_t t1a, t1b, r, s, t, u; /* temporary variables */
209
210 float16x8x2_t tw,xA,xB;
211 float16x8x2_t tmp1, tmp2, res;
212 uint16x8_t vecStridesBkwd;
213
214 vecStridesBkwd = vddupq_u16((uint16_t)14, 2);
215
216 int blockCnt;
217
218
219 k = (S->Sint).fftLen - 1;
220
221 xAR = pA[0];
222 xAI = pA[1];
223
224 pCoeff += 2 ;
225
226 *pOut++ = 0.5f16 * ( (_Float16)xAR + (_Float16)xAI );
227 *pOut++ = 0.5f16 * ( (_Float16)xAR - (_Float16)xAI );
228
229 pB = p + 2*k - 14;
230 pA += 2 ;
231
232 blockCnt = k >> 3;
233 while (blockCnt > 0)
234 {
235 /* G is half of the frequency complex spectrum */
236 //for k = 2:N
237 // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
238 xA = vld2q_f16(pA);
239 pA += 16;
240
241 xB = vld2q_f16(pB);
242
243 xB.val[0] = vldrhq_gather_shifted_offset_f16(pB, vecStridesBkwd);
244 xB.val[1] = vldrhq_gather_shifted_offset_f16(&pB[1], vecStridesBkwd);
245
246 xB.val[1] = vnegq_f16(xB.val[1]);
247 pB -= 16;
248
249
250 tw = vld2q_f16(pCoeff);
251 tw.val[1] = vnegq_f16(tw.val[1]);
252 pCoeff += 16;
253
254
255 tmp1.val[0] = vaddq_f16(xA.val[0],xB.val[0]);
256 tmp1.val[1] = vaddq_f16(xA.val[1],xB.val[1]);
257
258 tmp2.val[0] = vsubq_f16(xB.val[0],xA.val[0]);
259 tmp2.val[1] = vsubq_f16(xB.val[1],xA.val[1]);
260
261 res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
262 res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
263
264 res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
265 res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
266
267 res.val[0] = vaddq_f16(res.val[0],tmp1.val[0] );
268 res.val[1] = vaddq_f16(res.val[1],tmp1.val[1] );
269
270 res.val[0] = vmulq_n_f16(res.val[0], 0.5f);
271 res.val[1] = vmulq_n_f16(res.val[1], 0.5f);
272
273
274 vst2q_f16(pOut, res);
275 pOut += 16;
276
277
278 blockCnt--;
279 }
280
281 pB += 14;
282 blockCnt = k & 7;
283 while (blockCnt > 0)
284 {
285 /* G is half of the frequency complex spectrum */
286 //for k = 2:N
287 // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
288 xBI = pB[1] ;
289 xBR = pB[0] ;
290 xAR = pA[0];
291 xAI = pA[1];
292
293 twR = *pCoeff++;
294 twI = *pCoeff++;
295
296 t1a = (_Float16)xAR - (_Float16)xBR ;
297 t1b = (_Float16)xAI + (_Float16)xBI ;
298
299 r = (_Float16)twR * (_Float16)t1a;
300 s = (_Float16)twI * (_Float16)t1b;
301 t = (_Float16)twI * (_Float16)t1a;
302 u = (_Float16)twR * (_Float16)t1b;
303
304 // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
305 // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
306 *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR - (_Float16)r - (_Float16)s ); //xAR
307 *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)t - (_Float16)u ); //xAI
308
309 pA += 2;
310 pB -= 2;
311 blockCnt--;
312 }
313
314 }
315 #else
stage_rfft_f16(const arm_rfft_fast_instance_f16 * S,const float16_t * p,float16_t * pOut)316 static void stage_rfft_f16(
317 const arm_rfft_fast_instance_f16 * S,
318 const float16_t * p,
319 float16_t * pOut)
320 {
321 int32_t k; /* Loop Counter */
322 float16_t twR, twI; /* RFFT Twiddle coefficients */
323 const float16_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
324 const float16_t *pA = p; /* increasing pointer */
325 const float16_t *pB = p; /* decreasing pointer */
326 float16_t xAR, xAI, xBR, xBI; /* temporary variables */
327 float16_t t1a, t1b; /* temporary variables */
328 float16_t p0, p1, p2, p3; /* temporary variables */
329
330
331 k = (S->Sint).fftLen - 1;
332
333 /* Pack first and last sample of the frequency domain together */
334
335 xBR = pB[0];
336 xBI = pB[1];
337 xAR = pA[0];
338 xAI = pA[1];
339
340 twR = *pCoeff++ ;
341 twI = *pCoeff++ ;
342
343
344 // U1 = XA(1) + XB(1); % It is real
345 t1a = (_Float16)xBR + (_Float16)xAR ;
346
347 // U2 = XB(1) - XA(1); % It is imaginary
348 t1b = (_Float16)xBI + (_Float16)xAI ;
349
350 // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
351 // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
352 *pOut++ = 0.5f16 * ( (_Float16)t1a + (_Float16)t1b );
353 *pOut++ = 0.5f16 * ( (_Float16)t1a - (_Float16)t1b );
354
355 // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) ));
356 pB = p + 2*k;
357 pA += 2;
358
359 do
360 {
361 /*
362 function X = my_split_rfft(X, ifftFlag)
363 % X is a series of real numbers
364 L = length(X);
365 XC = X(1:2:end) +i*X(2:2:end);
366 XA = fft(XC);
367 XB = conj(XA([1 end:-1:2]));
368 TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
369 for l = 2:L/2
370 XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
371 end
372 XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
373 X = XA;
374 */
375
376 xBI = pB[1];
377 xBR = pB[0];
378 xAR = pA[0];
379 xAI = pA[1];
380
381 twR = *pCoeff++;
382 twI = *pCoeff++;
383
384 t1a = (_Float16)xBR - (_Float16)xAR ;
385 t1b = (_Float16)xBI + (_Float16)xAI ;
386
387 // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
388 // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
389 p0 = (_Float16)twR * (_Float16)t1a;
390 p1 = (_Float16)twI * (_Float16)t1a;
391 p2 = (_Float16)twR * (_Float16)t1b;
392 p3 = (_Float16)twI * (_Float16)t1b;
393
394 *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR + (_Float16)p0 + (_Float16)p3 ); //xAR
395 *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)p1 - (_Float16)p2 ); //xAI
396
397
398 pA += 2;
399 pB -= 2;
400 k--;
401 } while (k > 0);
402 }
403
404 /* Prepares data for inverse cfft */
merge_rfft_f16(const arm_rfft_fast_instance_f16 * S,const float16_t * p,float16_t * pOut)405 static void merge_rfft_f16(
406 const arm_rfft_fast_instance_f16 * S,
407 const float16_t * p,
408 float16_t * pOut)
409 {
410 int32_t k; /* Loop Counter */
411 float16_t twR, twI; /* RFFT Twiddle coefficients */
412 const float16_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
413 const float16_t *pA = p; /* increasing pointer */
414 const float16_t *pB = p; /* decreasing pointer */
415 float16_t xAR, xAI, xBR, xBI; /* temporary variables */
416 float16_t t1a, t1b, r, s, t, u; /* temporary variables */
417
418 k = (S->Sint).fftLen - 1;
419
420 xAR = pA[0];
421 xAI = pA[1];
422
423 pCoeff += 2 ;
424
425 *pOut++ = 0.5f16 * ( (_Float16)xAR + (_Float16)xAI );
426 *pOut++ = 0.5f16 * ( (_Float16)xAR - (_Float16)xAI );
427
428 pB = p + 2*k ;
429 pA += 2 ;
430
431 while (k > 0)
432 {
433 /* G is half of the frequency complex spectrum */
434 //for k = 2:N
435 // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
436 xBI = pB[1] ;
437 xBR = pB[0] ;
438 xAR = pA[0];
439 xAI = pA[1];
440
441 twR = *pCoeff++;
442 twI = *pCoeff++;
443
444 t1a = (_Float16)xAR - (_Float16)xBR ;
445 t1b = (_Float16)xAI + (_Float16)xBI ;
446
447 r = (_Float16)twR * (_Float16)t1a;
448 s = (_Float16)twI * (_Float16)t1b;
449 t = (_Float16)twI * (_Float16)t1a;
450 u = (_Float16)twR * (_Float16)t1b;
451
452 // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
453 // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
454 *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR - (_Float16)r - (_Float16)s ); //xAR
455 *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)t - (_Float16)u ); //xAI
456
457 pA += 2;
458 pB -= 2;
459 k--;
460 }
461
462 }
463
464 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
465
466 /**
467 @ingroup RealFFT
468 */
469
470 /**
471 @defgroup RealFFTF16 Real FFT F16 Functions
472 */
473
474 /**
475 @addtogroup RealFFTF16
476 @{
477 */
478
479 /**
480 @brief Processing function for the floating-point real FFT.
481 @param[in] S points to an arm_rfft_fast_instance_f16 structure
482 @param[in] p points to input buffer (Source buffer is modified by this function.)
483 @param[in] pOut points to output buffer
484 @param[in] ifftFlag
485 - value = 0: RFFT
486 - value = 1: RIFFT
487 */
488
arm_rfft_fast_f16(const arm_rfft_fast_instance_f16 * S,float16_t * p,float16_t * pOut,uint8_t ifftFlag)489 ARM_DSP_ATTRIBUTE void arm_rfft_fast_f16(
490 const arm_rfft_fast_instance_f16 * S,
491 float16_t * p,
492 float16_t * pOut,
493 uint8_t ifftFlag)
494 {
495 const arm_cfft_instance_f16 * Sint = &(S->Sint);
496
497
498 /* Calculation of Real FFT */
499 if (ifftFlag)
500 {
501 /* Real FFT compression */
502 merge_rfft_f16(S, p, pOut);
503 /* Complex radix-4 IFFT process */
504 arm_cfft_f16( Sint, pOut, ifftFlag, 1);
505 }
506 else
507 {
508
509 /* Calculation of RFFT of input */
510 arm_cfft_f16( Sint, p, ifftFlag, 1);
511
512 /* Real FFT extraction */
513 stage_rfft_f16(S, p, pOut);
514 }
515 }
516
517 /**
518 * @} end of RealFFTF16 group
519 */
520
521 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
522