1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cfft_radix8_f16.c
4 * Description: Radix-8 Decimation in Frequency CFFT & CIFFT Floating point processing function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/transform_functions_f16.h"
30
31 #if defined(ARM_FLOAT16_SUPPORTED)
32
/*
 * Forward declaration of the radix-8 butterfly kernel defined further down
 * in this file, so that a prototype is in scope ahead of the definition.
 */
void arm_radix8_butterfly_f16(
  float16_t * pSrc,
  uint16_t fftLen,
  const float16_t * pCoef,
  uint16_t twidCoefModifier);
38
39 /* ----------------------------------------------------------------------
40 * Internal helper function used by the FFTs
41 * -------------------------------------------------------------------- */
42
43 /**
44 brief Core function for the floating-point CFFT butterfly process.
45 param[in,out] pSrc points to the in-place buffer of floating-point data type.
46 param[in] fftLen length of the FFT.
47 param[in] pCoef points to the twiddle coefficient buffer.
48 param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
49 return none
50 */
51
arm_radix8_butterfly_f16(float16_t * pSrc,uint16_t fftLen,const float16_t * pCoef,uint16_t twidCoefModifier)52 ARM_DSP_ATTRIBUTE void arm_radix8_butterfly_f16(
53 float16_t * pSrc,
54 uint16_t fftLen,
55 const float16_t * pCoef,
56 uint16_t twidCoefModifier)
57 {
58 uint32_t ia1, ia2, ia3, ia4, ia5, ia6, ia7;
59 uint32_t i1, i2, i3, i4, i5, i6, i7, i8;
60 uint32_t id;
61 uint32_t n1, n2, j;
62
63 float16_t r1, r2, r3, r4, r5, r6, r7, r8;
64 float16_t t1, t2;
65 float16_t s1, s2, s3, s4, s5, s6, s7, s8;
66 float16_t p1, p2, p3, p4;
67 float16_t co2, co3, co4, co5, co6, co7, co8;
68 float16_t si2, si3, si4, si5, si6, si7, si8;
69 const float16_t C81 = 0.70710678118f16;
70
71 n2 = fftLen;
72
73 do
74 {
75 n1 = n2;
76 n2 = n2 >> 3;
77 i1 = 0;
78
79 do
80 {
81 i2 = i1 + n2;
82 i3 = i2 + n2;
83 i4 = i3 + n2;
84 i5 = i4 + n2;
85 i6 = i5 + n2;
86 i7 = i6 + n2;
87 i8 = i7 + n2;
88 r1 = (_Float16)pSrc[2 * i1] + (_Float16)pSrc[2 * i5];
89 r5 = (_Float16)pSrc[2 * i1] - (_Float16)pSrc[2 * i5];
90 r2 = (_Float16)pSrc[2 * i2] + (_Float16)pSrc[2 * i6];
91 r6 = (_Float16)pSrc[2 * i2] - (_Float16)pSrc[2 * i6];
92 r3 = (_Float16)pSrc[2 * i3] + (_Float16)pSrc[2 * i7];
93 r7 = (_Float16)pSrc[2 * i3] - (_Float16)pSrc[2 * i7];
94 r4 = (_Float16)pSrc[2 * i4] + (_Float16)pSrc[2 * i8];
95 r8 = (_Float16)pSrc[2 * i4] - (_Float16)pSrc[2 * i8];
96 t1 = (_Float16)r1 - (_Float16)r3;
97 r1 = (_Float16)r1 + (_Float16)r3;
98 r3 = (_Float16)r2 - (_Float16)r4;
99 r2 = (_Float16)r2 + (_Float16)r4;
100 pSrc[2 * i1] = (_Float16)r1 + (_Float16)r2;
101 pSrc[2 * i5] = (_Float16)r1 - (_Float16)r2;
102 r1 = (_Float16)pSrc[2 * i1 + 1] + (_Float16)pSrc[2 * i5 + 1];
103 s5 = (_Float16)pSrc[2 * i1 + 1] - (_Float16)pSrc[2 * i5 + 1];
104 r2 = (_Float16)pSrc[2 * i2 + 1] + (_Float16)pSrc[2 * i6 + 1];
105 s6 = (_Float16)pSrc[2 * i2 + 1] - (_Float16)pSrc[2 * i6 + 1];
106 s3 = (_Float16)pSrc[2 * i3 + 1] + (_Float16)pSrc[2 * i7 + 1];
107 s7 = (_Float16)pSrc[2 * i3 + 1] - (_Float16)pSrc[2 * i7 + 1];
108 r4 = (_Float16)pSrc[2 * i4 + 1] + (_Float16)pSrc[2 * i8 + 1];
109 s8 = (_Float16)pSrc[2 * i4 + 1] - (_Float16)pSrc[2 * i8 + 1];
110 t2 = (_Float16)r1 - (_Float16)s3;
111 r1 = (_Float16)r1 + (_Float16)s3;
112 s3 = (_Float16)r2 - (_Float16)r4;
113 r2 = (_Float16)r2 + (_Float16)r4;
114 pSrc[2 * i1 + 1] = (_Float16)r1 + (_Float16)r2;
115 pSrc[2 * i5 + 1] = (_Float16)r1 - (_Float16)r2;
116 pSrc[2 * i3] = (_Float16)t1 + (_Float16)s3;
117 pSrc[2 * i7] = (_Float16)t1 - (_Float16)s3;
118 pSrc[2 * i3 + 1] = (_Float16)t2 - (_Float16)r3;
119 pSrc[2 * i7 + 1] = (_Float16)t2 + (_Float16)r3;
120 r1 = ((_Float16)r6 - (_Float16)r8) * (_Float16)C81;
121 r6 = ((_Float16)r6 + (_Float16)r8) * (_Float16)C81;
122 r2 = ((_Float16)s6 - (_Float16)s8) * (_Float16)C81;
123 s6 = ((_Float16)s6 + (_Float16)s8) * (_Float16)C81;
124 t1 = (_Float16)r5 - (_Float16)r1;
125 r5 = (_Float16)r5 + (_Float16)r1;
126 r8 = (_Float16)r7 - (_Float16)r6;
127 r7 = (_Float16)r7 + (_Float16)r6;
128 t2 = (_Float16)s5 - (_Float16)r2;
129 s5 = (_Float16)s5 + (_Float16)r2;
130 s8 = (_Float16)s7 - (_Float16)s6;
131 s7 = (_Float16)s7 + (_Float16)s6;
132 pSrc[2 * i2] = (_Float16)r5 + (_Float16)s7;
133 pSrc[2 * i8] = (_Float16)r5 - (_Float16)s7;
134 pSrc[2 * i6] = (_Float16)t1 + (_Float16)s8;
135 pSrc[2 * i4] = (_Float16)t1 - (_Float16)s8;
136 pSrc[2 * i2 + 1] = (_Float16)s5 - (_Float16)r7;
137 pSrc[2 * i8 + 1] = (_Float16)s5 + (_Float16)r7;
138 pSrc[2 * i6 + 1] = (_Float16)t2 - (_Float16)r8;
139 pSrc[2 * i4 + 1] = (_Float16)t2 + (_Float16)r8;
140
141 i1 += n1;
142 } while (i1 < fftLen);
143
144 if (n2 < 8)
145 break;
146
147 ia1 = 0;
148 j = 1;
149
150 do
151 {
152 /* index calculation for the coefficients */
153 id = ia1 + twidCoefModifier;
154 ia1 = id;
155 ia2 = ia1 + id;
156 ia3 = ia2 + id;
157 ia4 = ia3 + id;
158 ia5 = ia4 + id;
159 ia6 = ia5 + id;
160 ia7 = ia6 + id;
161
162 co2 = pCoef[2 * ia1];
163 co3 = pCoef[2 * ia2];
164 co4 = pCoef[2 * ia3];
165 co5 = pCoef[2 * ia4];
166 co6 = pCoef[2 * ia5];
167 co7 = pCoef[2 * ia6];
168 co8 = pCoef[2 * ia7];
169 si2 = pCoef[2 * ia1 + 1];
170 si3 = pCoef[2 * ia2 + 1];
171 si4 = pCoef[2 * ia3 + 1];
172 si5 = pCoef[2 * ia4 + 1];
173 si6 = pCoef[2 * ia5 + 1];
174 si7 = pCoef[2 * ia6 + 1];
175 si8 = pCoef[2 * ia7 + 1];
176
177 i1 = j;
178
179 do
180 {
181 /* index calculation for the input */
182 i2 = i1 + n2;
183 i3 = i2 + n2;
184 i4 = i3 + n2;
185 i5 = i4 + n2;
186 i6 = i5 + n2;
187 i7 = i6 + n2;
188 i8 = i7 + n2;
189 r1 = (_Float16)pSrc[2 * i1] + (_Float16)pSrc[2 * i5];
190 r5 = (_Float16)pSrc[2 * i1] - (_Float16)pSrc[2 * i5];
191 r2 = (_Float16)pSrc[2 * i2] + (_Float16)pSrc[2 * i6];
192 r6 = (_Float16)pSrc[2 * i2] - (_Float16)pSrc[2 * i6];
193 r3 = (_Float16)pSrc[2 * i3] + (_Float16)pSrc[2 * i7];
194 r7 = (_Float16)pSrc[2 * i3] - (_Float16)pSrc[2 * i7];
195 r4 = (_Float16)pSrc[2 * i4] + (_Float16)pSrc[2 * i8];
196 r8 = (_Float16)pSrc[2 * i4] - (_Float16)pSrc[2 * i8];
197 t1 = (_Float16)r1 - (_Float16)r3;
198 r1 = (_Float16)r1 + (_Float16)r3;
199 r3 = (_Float16)r2 - (_Float16)r4;
200 r2 = (_Float16)r2 + (_Float16)r4;
201 pSrc[2 * i1] = (_Float16)r1 + (_Float16)r2;
202 r2 = (_Float16)r1 - (_Float16)r2;
203 s1 = (_Float16)pSrc[2 * i1 + 1] + (_Float16)pSrc[2 * i5 + 1];
204 s5 = (_Float16)pSrc[2 * i1 + 1] - (_Float16)pSrc[2 * i5 + 1];
205 s2 = (_Float16)pSrc[2 * i2 + 1] + (_Float16)pSrc[2 * i6 + 1];
206 s6 = (_Float16)pSrc[2 * i2 + 1] - (_Float16)pSrc[2 * i6 + 1];
207 s3 = (_Float16)pSrc[2 * i3 + 1] + (_Float16)pSrc[2 * i7 + 1];
208 s7 = (_Float16)pSrc[2 * i3 + 1] - (_Float16)pSrc[2 * i7 + 1];
209 s4 = (_Float16)pSrc[2 * i4 + 1] + (_Float16)pSrc[2 * i8 + 1];
210 s8 = (_Float16)pSrc[2 * i4 + 1] - (_Float16)pSrc[2 * i8 + 1];
211 t2 = (_Float16)s1 - (_Float16)s3;
212 s1 = (_Float16)s1 + (_Float16)s3;
213 s3 = (_Float16)s2 - (_Float16)s4;
214 s2 = (_Float16)s2 + (_Float16)s4;
215 r1 = (_Float16)t1 + (_Float16)s3;
216 t1 = (_Float16)t1 - (_Float16)s3;
217 pSrc[2 * i1 + 1] = (_Float16)s1 + (_Float16)s2;
218 s2 = (_Float16)s1 - (_Float16)s2;
219 s1 = (_Float16)t2 - (_Float16)r3;
220 t2 = (_Float16)t2 + (_Float16)r3;
221 p1 = (_Float16)co5 * (_Float16)r2;
222 p2 = (_Float16)si5 * (_Float16)s2;
223 p3 = (_Float16)co5 * (_Float16)s2;
224 p4 = (_Float16)si5 * (_Float16)r2;
225 pSrc[2 * i5] = (_Float16)p1 + (_Float16)p2;
226 pSrc[2 * i5 + 1] = (_Float16)p3 - (_Float16)p4;
227 p1 = (_Float16)co3 * (_Float16)r1;
228 p2 = (_Float16)si3 * (_Float16)s1;
229 p3 = (_Float16)co3 * (_Float16)s1;
230 p4 = (_Float16)si3 * (_Float16)r1;
231 pSrc[2 * i3] = (_Float16)p1 + (_Float16)p2;
232 pSrc[2 * i3 + 1] = (_Float16)p3 - (_Float16)p4;
233 p1 = (_Float16)co7 * (_Float16)t1;
234 p2 = (_Float16)si7 * (_Float16)t2;
235 p3 = (_Float16)co7 * (_Float16)t2;
236 p4 = (_Float16)si7 * (_Float16)t1;
237 pSrc[2 * i7] = (_Float16)p1 + (_Float16)p2;
238 pSrc[2 * i7 + 1] = (_Float16)p3 - (_Float16)p4;
239 r1 = ((_Float16)r6 - (_Float16)r8) * (_Float16)C81;
240 r6 = ((_Float16)r6 + (_Float16)r8) * (_Float16)C81;
241 s1 = ((_Float16)s6 - (_Float16)s8) * (_Float16)C81;
242 s6 = ((_Float16)s6 + (_Float16)s8) * (_Float16)C81;
243 t1 = (_Float16)r5 - (_Float16)r1;
244 r5 = (_Float16)r5 + (_Float16)r1;
245 r8 = (_Float16)r7 - (_Float16)r6;
246 r7 = (_Float16)r7 + (_Float16)r6;
247 t2 = (_Float16)s5 - (_Float16)s1;
248 s5 = (_Float16)s5 + (_Float16)s1;
249 s8 = (_Float16)s7 - (_Float16)s6;
250 s7 = (_Float16)s7 + (_Float16)s6;
251 r1 = (_Float16)r5 + (_Float16)s7;
252 r5 = (_Float16)r5 - (_Float16)s7;
253 r6 = (_Float16)t1 + (_Float16)s8;
254 t1 = (_Float16)t1 - (_Float16)s8;
255 s1 = (_Float16)s5 - (_Float16)r7;
256 s5 = (_Float16)s5 + (_Float16)r7;
257 s6 = (_Float16)t2 - (_Float16)r8;
258 t2 = (_Float16)t2 + (_Float16)r8;
259 p1 = (_Float16)co2 * (_Float16)r1;
260 p2 = (_Float16)si2 * (_Float16)s1;
261 p3 = (_Float16)co2 * (_Float16)s1;
262 p4 = (_Float16)si2 * (_Float16)r1;
263 pSrc[2 * i2] = (_Float16)p1 + (_Float16)p2;
264 pSrc[2 * i2 + 1] = (_Float16)p3 - (_Float16)p4;
265 p1 = (_Float16)co8 * (_Float16)r5;
266 p2 = (_Float16)si8 * (_Float16)s5;
267 p3 = (_Float16)co8 * (_Float16)s5;
268 p4 = (_Float16)si8 * (_Float16)r5;
269 pSrc[2 * i8] = (_Float16)p1 + (_Float16)p2;
270 pSrc[2 * i8 + 1] = (_Float16)p3 - (_Float16)p4;
271 p1 = (_Float16)co6 * (_Float16)r6;
272 p2 = (_Float16)si6 * (_Float16)s6;
273 p3 = (_Float16)co6 * (_Float16)s6;
274 p4 = (_Float16)si6 * (_Float16)r6;
275 pSrc[2 * i6] = (_Float16)p1 + (_Float16)p2;
276 pSrc[2 * i6 + 1] = (_Float16)p3 - (_Float16)p4;
277 p1 = (_Float16)co4 * (_Float16)t1;
278 p2 = (_Float16)si4 * (_Float16)t2;
279 p3 = (_Float16)co4 * (_Float16)t2;
280 p4 = (_Float16)si4 * (_Float16)t1;
281 pSrc[2 * i4] = (_Float16)p1 + (_Float16)p2;
282 pSrc[2 * i4 + 1] = (_Float16)p3 - (_Float16)p4;
283
284 i1 += n1;
285 } while (i1 < fftLen);
286
287 j++;
288 } while (j < n2);
289
290 twidCoefModifier <<= 3;
291 } while (n2 > 7);
292 }
293
294 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
295