1 /*
2 Copyright (c) 2003-2010, Mark Borgerding
3 
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
7 
8     * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
9     * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
10     * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11 
12 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
13 */
14 
15 
16 #include "_kiss_fft_guts.h"
17 /* The guts header contains all the multiplication and addition macros that are defined for
18  fixed or floating point complex numbers.  It also delares the kf_ internal functions.
19  */
20 
21 // CHRE modifications begin
22 #if defined(__clang__)
23 #pragma clang diagnostic push
24 #pragma clang diagnostic ignored "-Wcast-align"
25 #pragma clang diagnostic ignored "-Wbad-function-cast"
26 #endif
27 // CHRE modifications end
28 
kf_bfly2(kiss_fft_cpx * Fout,const size_t fstride,const kiss_fft_cfg st,int m)29 static void kf_bfly2(
30         kiss_fft_cpx * Fout,
31         const size_t fstride,
32         const kiss_fft_cfg st,
33         int m
34         )
35 {
36     kiss_fft_cpx * Fout2;
37     kiss_fft_cpx * tw1 = st->twiddles;
38     kiss_fft_cpx t;
39     Fout2 = Fout + m;
40     do{
41         C_FIXDIV(*Fout,2); C_FIXDIV(*Fout2,2);
42 
43         C_MUL (t,  *Fout2 , *tw1);
44         tw1 += fstride;
45         C_SUB( *Fout2 ,  *Fout , t );
46         C_ADDTO( *Fout ,  t );
47         ++Fout2;
48         ++Fout;
49     }while (--m);
50 }
51 
kf_bfly4(kiss_fft_cpx * Fout,const size_t fstride,const kiss_fft_cfg st,const size_t m)52 static void kf_bfly4(
53         kiss_fft_cpx * Fout,
54         const size_t fstride,
55         const kiss_fft_cfg st,
56         const size_t m
57         )
58 {
59     kiss_fft_cpx *tw1,*tw2,*tw3;
60     kiss_fft_cpx scratch[6];
61     size_t k=m;
62     const size_t m2=2*m;
63     const size_t m3=3*m;
64 
65 
66     tw3 = tw2 = tw1 = st->twiddles;
67 
68     do {
69         C_FIXDIV(*Fout,4); C_FIXDIV(Fout[m],4); C_FIXDIV(Fout[m2],4); C_FIXDIV(Fout[m3],4);
70 
71         C_MUL(scratch[0],Fout[m] , *tw1 );
72         C_MUL(scratch[1],Fout[m2] , *tw2 );
73         C_MUL(scratch[2],Fout[m3] , *tw3 );
74 
75         C_SUB( scratch[5] , *Fout, scratch[1] );
76         C_ADDTO(*Fout, scratch[1]);
77         C_ADD( scratch[3] , scratch[0] , scratch[2] );
78         C_SUB( scratch[4] , scratch[0] , scratch[2] );
79         C_SUB( Fout[m2], *Fout, scratch[3] );
80         tw1 += fstride;
81         tw2 += fstride*2;
82         tw3 += fstride*3;
83         C_ADDTO( *Fout , scratch[3] );
84 
85         if(st->inverse) {
86             Fout[m].r = scratch[5].r - scratch[4].i;
87             Fout[m].i = scratch[5].i + scratch[4].r;
88             Fout[m3].r = scratch[5].r + scratch[4].i;
89             Fout[m3].i = scratch[5].i - scratch[4].r;
90         }else{
91             Fout[m].r = scratch[5].r + scratch[4].i;
92             Fout[m].i = scratch[5].i - scratch[4].r;
93             Fout[m3].r = scratch[5].r - scratch[4].i;
94             Fout[m3].i = scratch[5].i + scratch[4].r;
95         }
96         ++Fout;
97     }while(--k);
98 }
99 
kf_bfly3(kiss_fft_cpx * Fout,const size_t fstride,const kiss_fft_cfg st,size_t m)100 static void kf_bfly3(
101          kiss_fft_cpx * Fout,
102          const size_t fstride,
103          const kiss_fft_cfg st,
104          size_t m
105          )
106 {
107      size_t k=m;
108      const size_t m2 = 2*m;
109      kiss_fft_cpx *tw1,*tw2;
110      kiss_fft_cpx scratch[5];
111      kiss_fft_cpx epi3;
112      epi3 = st->twiddles[fstride*m];
113 
114      tw1=tw2=st->twiddles;
115 
116      do{
117          C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3);
118 
119          C_MUL(scratch[1],Fout[m] , *tw1);
120          C_MUL(scratch[2],Fout[m2] , *tw2);
121 
122          C_ADD(scratch[3],scratch[1],scratch[2]);
123          C_SUB(scratch[0],scratch[1],scratch[2]);
124          tw1 += fstride;
125          tw2 += fstride*2;
126 
127          Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
128          Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
129 
130          C_MULBYSCALAR( scratch[0] , epi3.i );
131 
132          C_ADDTO(*Fout,scratch[3]);
133 
134          Fout[m2].r = Fout[m].r + scratch[0].i;
135          Fout[m2].i = Fout[m].i - scratch[0].r;
136 
137          Fout[m].r -= scratch[0].i;
138          Fout[m].i += scratch[0].r;
139 
140          ++Fout;
141      }while(--k);
142 }
143 
kf_bfly5(kiss_fft_cpx * Fout,const size_t fstride,const kiss_fft_cfg st,int m)144 static void kf_bfly5(
145         kiss_fft_cpx * Fout,
146         const size_t fstride,
147         const kiss_fft_cfg st,
148         int m
149         )
150 {
151     kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
152     size_t u;
153     kiss_fft_cpx scratch[13];
154     kiss_fft_cpx * twiddles = st->twiddles;
155     kiss_fft_cpx *tw;
156     kiss_fft_cpx ya,yb;
157     ya = twiddles[fstride*(size_t)m];
158     yb = twiddles[fstride*2*(size_t)m];
159 
160     Fout0=Fout;
161     Fout1=Fout0+m;
162     Fout2=Fout0+2*m;
163     Fout3=Fout0+3*m;
164     Fout4=Fout0+4*m;
165 
166     tw=st->twiddles;
167     for ( u=0; u<(size_t)m; ++u ) {
168         C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5);
169         scratch[0] = *Fout0;
170 
171         C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
172         C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
173         C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
174         C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);
175 
176         C_ADD( scratch[7],scratch[1],scratch[4]);
177         C_SUB( scratch[10],scratch[1],scratch[4]);
178         C_ADD( scratch[8],scratch[2],scratch[3]);
179         C_SUB( scratch[9],scratch[2],scratch[3]);
180 
181         Fout0->r += scratch[7].r + scratch[8].r;
182         Fout0->i += scratch[7].i + scratch[8].i;
183 
184         scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
185         scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
186 
187         scratch[6].r =  S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i);
188         scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i);
189 
190         C_SUB(*Fout1,scratch[5],scratch[6]);
191         C_ADD(*Fout4,scratch[5],scratch[6]);
192 
193         scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
194         scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
195         scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i);
196         scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i);
197 
198         C_ADD(*Fout2,scratch[11],scratch[12]);
199         C_SUB(*Fout3,scratch[11],scratch[12]);
200 
201         ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
202     }
203 }
204 
205 /* perform the butterfly for one stage of a mixed radix FFT */
kf_bfly_generic(kiss_fft_cpx * Fout,const size_t fstride,const kiss_fft_cfg st,int m,int p)206 static void kf_bfly_generic(
207         kiss_fft_cpx * Fout,
208         const size_t fstride,
209         const kiss_fft_cfg st,
210         int m,
211         int p
212         )
213 {
214     int u,k,q1,q;
215     kiss_fft_cpx * twiddles = st->twiddles;
216     kiss_fft_cpx t;
217     int Norig = st->nfft;
218 
219     kiss_fft_cpx * scratch = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC(sizeof(kiss_fft_cpx)*(size_t)p);
220 
221     for ( u=0; u<m; ++u ) {
222         k=u;
223         for ( q1=0 ; q1<p ; ++q1 ) {
224             scratch[q1] = Fout[ k  ];
225             C_FIXDIV(scratch[q1],p);
226             k += m;
227         }
228 
229         k=u;
230         for ( q1=0 ; q1<p ; ++q1 ) {
231             int twidx=0;
232             Fout[ k ] = scratch[0];
233             for (q=1;q<p;++q ) {
234                 twidx += fstride * (size_t)k;
235                 if (twidx>=Norig) twidx-=Norig;
236                 C_MUL(t,scratch[q] , twiddles[twidx] );
237                 C_ADDTO( Fout[ k ] ,t);
238             }
239             k += m;
240         }
241     }
242     KISS_FFT_TMP_FREE(scratch);
243 }
244 
245 static
kf_work(kiss_fft_cpx * Fout,const kiss_fft_cpx * f,const size_t fstride,int in_stride,int * factors,const kiss_fft_cfg st)246 void kf_work(
247         kiss_fft_cpx * Fout,
248         const kiss_fft_cpx * f,
249         const size_t fstride,
250         int in_stride,
251         int * factors,
252         const kiss_fft_cfg st
253         )
254 {
255     kiss_fft_cpx * Fout_beg=Fout;
256     const int p=*factors++; /* the radix  */
257     const int m=*factors++; /* stage's fft length/p */
258     const kiss_fft_cpx * Fout_end = Fout + p*m;
259 
260 #ifdef _OPENMP
261     // use openmp extensions at the
262     // top-level (not recursive)
263     if (fstride==1 && p<=5)
264     {
265         int k;
266 
267         // execute the p different work units in different threads
268 #       pragma omp parallel for
269         for (k=0;k<p;++k)
270             kf_work( Fout +k*m, f+ fstride*in_stride*k,fstride*p,in_stride,factors,st);
271         // all threads have joined by this point
272 
273         switch (p) {
274             case 2: kf_bfly2(Fout,fstride,st,m); break;
275             case 3: kf_bfly3(Fout,fstride,st,m); break;
276             case 4: kf_bfly4(Fout,fstride,st,m); break;
277             case 5: kf_bfly5(Fout,fstride,st,m); break;
278             default: kf_bfly_generic(Fout,fstride,st,m,p); break;
279         }
280         return;
281     }
282 #endif
283 
284     if (m==1) {
285         do{
286             *Fout = *f;
287             f += fstride*(size_t)in_stride;
288         }while(++Fout != Fout_end );
289     }else{
290         do{
291             // recursive call:
292             // DFT of size m*p performed by doing
293             // p instances of smaller DFTs of size m,
294             // each one takes a decimated version of the input
295             kf_work( Fout , f, fstride*(size_t)p, in_stride, factors,st);
296             f += fstride*(size_t)in_stride;
297         }while( (Fout += m) != Fout_end );
298     }
299 
300     Fout=Fout_beg;
301 
302     // recombine the p smaller DFTs
303     switch (p) {
304         case 2: kf_bfly2(Fout,fstride,st,m); break;
305         case 3: kf_bfly3(Fout,fstride,st,(size_t)m); break;
306         case 4: kf_bfly4(Fout,fstride,st,(size_t)m); break;
307         case 5: kf_bfly5(Fout,fstride,st,m); break;
308         default: kf_bfly_generic(Fout,fstride,st,m,p); break;
309     }
310 }
311 
312 /*  facbuf is populated by p1,m1,p2,m2, ...
313     where
314     p[i] * m[i] = m[i-1]
315     m0 = n                  */
316 static
kf_factor(int n,int * facbuf)317 void kf_factor(int n,int * facbuf)
318 {
319     int p=4;
320     double floor_sqrt;
321     floor_sqrt = floor( sqrt((double)n) );
322 
323     /*factor out powers of 4, powers of 2, then any remaining primes */
324     do {
325         while (n % p) {
326             switch (p) {
327                 case 4: p = 2; break;
328                 case 2: p = 3; break;
329                 default: p += 2; break;
330             }
331             if (p > floor_sqrt)
332                 p = n;          /* no more factors, skip to end */
333         }
334         n /= p;
335         *facbuf++ = p;
336         *facbuf++ = n;
337     } while (n > 1);
338 }
339 
340 /*
341  *
342  * User-callable function to allocate all necessary storage space for the fft.
343  *
344  * The return value is a contiguous block of memory, allocated with malloc.  As such,
345  * It can be freed with free(), rather than a kiss_fft-specific function.
346  * */
kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem)347 kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem )
348 {
349     kiss_fft_cfg st=NULL;
350     size_t memneeded = sizeof(struct kiss_fft_state)
351         + sizeof(kiss_fft_cpx)*(size_t)(nfft-1); /* twiddle factors*/
352 
353     if ( lenmem==NULL ) {
354         st = ( kiss_fft_cfg)KISS_FFT_MALLOC( memneeded );
355     }else{
356         if (mem != NULL && *lenmem >= memneeded)
357             st = (kiss_fft_cfg)mem;
358         *lenmem = memneeded;
359     }
360     if (st) {
361         int i;
362         st->nfft=nfft;
363         st->inverse = inverse_fft;
364 
365         for (i=0;i<nfft;++i) {
366             const double pi=3.141592653589793238462643383279502884197169399375105820974944;
367             double phase = -2*pi*i / nfft;
368             if (st->inverse)
369                 phase *= -1;
370             kf_cexp(st->twiddles+i, phase );
371         }
372 
373         kf_factor(nfft,st->factors);
374     }
375     return st;
376 }
377 
378 
kiss_fft_stride(kiss_fft_cfg st,const kiss_fft_cpx * fin,kiss_fft_cpx * fout,int in_stride)379 void kiss_fft_stride(kiss_fft_cfg st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int in_stride)
380 {
381     if (fin == fout) {
382         //NOTE: this is not really an in-place FFT algorithm.
383         //It just performs an out-of-place FFT into a temp buffer
384         kiss_fft_cpx * tmpbuf = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC( sizeof(kiss_fft_cpx)*(size_t)st->nfft);
385         kf_work(tmpbuf,fin,1,in_stride, st->factors,st);
386         memcpy(fout,tmpbuf,sizeof(kiss_fft_cpx)*(size_t)(st->nfft));
387         KISS_FFT_TMP_FREE(tmpbuf);
388     }else{
389         kf_work( fout, fin, 1,in_stride, st->factors,st );
390     }
391 }
392 
kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx * fin,kiss_fft_cpx * fout)393 void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
394 {
395     kiss_fft_stride(cfg,fin,fout,1);
396 }
397 
398 
kiss_fft_cleanup(void)399 void kiss_fft_cleanup(void)
400 {
401     // nothing needed any more
402 }
403 
kiss_fft_next_fast_size(int n)404 int kiss_fft_next_fast_size(int n)
405 {
406     while(1) {
407         int m=n;
408         while ( (m%2) == 0 ) m/=2;
409         while ( (m%3) == 0 ) m/=3;
410         while ( (m%5) == 0 ) m/=5;
411         if (m<=1)
412             break; /* n is completely factorable by twos, threes, and fives */
413         n++;
414     }
415     return n;
416 }
417 
418 // CHRE modifications begin
419 #if defined(__clang__)
420 #pragma clang diagnostic pop
421 #endif
422 // CHRE modifications end
423