1*a58d3d2aSXin Li /*Copyright (c) 2003-2004, Mark Borgerding
2*a58d3d2aSXin Li Lots of modifications by Jean-Marc Valin
3*a58d3d2aSXin Li Copyright (c) 2005-2007, Xiph.Org Foundation
4*a58d3d2aSXin Li Copyright (c) 2008, Xiph.Org Foundation, CSIRO
5*a58d3d2aSXin Li
6*a58d3d2aSXin Li All rights reserved.
7*a58d3d2aSXin Li
8*a58d3d2aSXin Li Redistribution and use in source and binary forms, with or without
9*a58d3d2aSXin Li modification, are permitted provided that the following conditions are met:
10*a58d3d2aSXin Li
11*a58d3d2aSXin Li * Redistributions of source code must retain the above copyright notice,
12*a58d3d2aSXin Li this list of conditions and the following disclaimer.
13*a58d3d2aSXin Li * Redistributions in binary form must reproduce the above copyright notice,
14*a58d3d2aSXin Li this list of conditions and the following disclaimer in the
15*a58d3d2aSXin Li documentation and/or other materials provided with the distribution.
16*a58d3d2aSXin Li
17*a58d3d2aSXin Li THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18*a58d3d2aSXin Li AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19*a58d3d2aSXin Li IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20*a58d3d2aSXin Li ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21*a58d3d2aSXin Li LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22*a58d3d2aSXin Li CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23*a58d3d2aSXin Li SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24*a58d3d2aSXin Li INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25*a58d3d2aSXin Li CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26*a58d3d2aSXin Li ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27*a58d3d2aSXin Li POSSIBILITY OF SUCH DAMAGE.*/
28*a58d3d2aSXin Li
29*a58d3d2aSXin Li /* This code is originally from Mark Borgerding's KISS-FFT but has been
30*a58d3d2aSXin Li heavily modified to better suit Opus */
31*a58d3d2aSXin Li
32*a58d3d2aSXin Li #ifndef SKIP_CONFIG_H
33*a58d3d2aSXin Li # ifdef HAVE_CONFIG_H
34*a58d3d2aSXin Li # include "config.h"
35*a58d3d2aSXin Li # endif
36*a58d3d2aSXin Li #endif
37*a58d3d2aSXin Li
38*a58d3d2aSXin Li #include "_kiss_fft_guts.h"
39*a58d3d2aSXin Li #include "arch.h"
40*a58d3d2aSXin Li #include "os_support.h"
41*a58d3d2aSXin Li #include "mathops.h"
42*a58d3d2aSXin Li #include "stack_alloc.h"
43*a58d3d2aSXin Li
/* The guts header contains all the multiplication and addition macros that are defined for
   complex numbers. It also declares the kf_ internal functions.
*/
47*a58d3d2aSXin Li
/* Radix-2 butterfly pass over N transforms of length 2*m.
   In the non-custom build this pass only ever runs immediately after a
   radix-4 stage, so m is always 4 and the four twiddles reduce to
   1, (1-i)/sqrt(2), -i and -(1+i)/sqrt(2), which are applied inline. */
static void kf_bfly2(
     kiss_fft_cpx * Fout,
     int m,
     int N
    )
{
   kiss_fft_cpx *upper;
   int blk;
   (void)m;
#ifdef CUSTOM_MODES
   if (m==1)
   {
      celt_assert(m==1);
      for (blk=0;blk<N;blk++)
      {
         kiss_fft_cpx d;
         upper = Fout + 1;
         d = *upper;
         C_SUB( *upper , *Fout , d );
         C_ADDTO( *Fout , d );
         Fout += 2;
      }
   } else
#endif
   {
      /* cos(pi/4) == sin(pi/4) == 1/sqrt(2), Q15 in fixed point */
      opus_val16 c45 = QCONST16(0.7071067812f, 15);
      /* We know that m==4 here because the radix-2 is just after a radix-4 */
      celt_assert(m==4);
      for (blk=0;blk<N;blk++)
      {
         kiss_fft_cpx d;
         upper = Fout + 4;

         /* twiddle == 1 */
         d = upper[0];
         C_SUB( upper[0] , Fout[0] , d );
         C_ADDTO( Fout[0] , d );

         /* twiddle == (1-i)/sqrt(2) */
         d.r = S_MUL(ADD32_ovflw(upper[1].r, upper[1].i), c45);
         d.i = S_MUL(SUB32_ovflw(upper[1].i, upper[1].r), c45);
         C_SUB( upper[1] , Fout[1] , d );
         C_ADDTO( Fout[1] , d );

         /* twiddle == -i */
         d.r = upper[2].i;
         d.i = -upper[2].r;
         C_SUB( upper[2] , Fout[2] , d );
         C_ADDTO( Fout[2] , d );

         /* twiddle == -(1+i)/sqrt(2) */
         d.r = S_MUL(SUB32_ovflw(upper[3].i, upper[3].r), c45);
         d.i = S_MUL(NEG32_ovflw(ADD32_ovflw(upper[3].i, upper[3].r)), c45);
         C_SUB( upper[3] , Fout[3] , d );
         C_ADDTO( Fout[3] , d );
         Fout += 8;
      }
   }
}
103*a58d3d2aSXin Li
/* Radix-4 butterfly pass over N transforms.  fstride is the twiddle-table
   stride for this stage, m the current sub-FFT length, mm the spacing
   between successive transforms in Fout. */
static void kf_bfly4(
     kiss_fft_cpx * Fout,
     const size_t fstride,
     const kiss_fft_state *st,
     int m,
     int N,
     int mm
    )
{
   int blk;

   if (m==1)
   {
      /* Degenerate case where all the twiddles are 1. */
      for (blk=0;blk<N;blk++)
      {
         kiss_fft_cpx d02, s13;

         C_SUB( d02 , *Fout, Fout[2] );
         C_ADDTO(*Fout, Fout[2]);
         C_ADD( s13 , Fout[1] , Fout[3] );
         C_SUB( Fout[2], *Fout, s13 );
         C_ADDTO( *Fout , s13 );
         C_SUB( s13 , Fout[1] , Fout[3] );

         /* The odd-leg difference gets rotated by -i / +i for bins 1 and 3. */
         Fout[1].r = ADD32_ovflw(d02.r, s13.i);
         Fout[1].i = SUB32_ovflw(d02.i, s13.r);
         Fout[3].r = SUB32_ovflw(d02.r, s13.i);
         Fout[3].i = ADD32_ovflw(d02.i, s13.r);
         Fout+=4;
      }
   } else {
      int j;
      kiss_fft_cpx s[6];
      const kiss_twiddle_cpx *w1,*w2,*w3;
      const int m2=2*m;
      const int m3=3*m;
      kiss_fft_cpx * const Fout_beg = Fout;
      for (blk=0;blk<N;blk++)
      {
         Fout = Fout_beg + blk*mm;
         w3 = w2 = w1 = st->twiddles;
         /* m is guaranteed to be a multiple of 4. */
         for (j=0;j<m;j++)
         {
            C_MUL(s[0],Fout[m] , *w1 );
            C_MUL(s[1],Fout[m2] , *w2 );
            C_MUL(s[2],Fout[m3] , *w3 );

            C_SUB( s[5] , *Fout, s[1] );
            C_ADDTO(*Fout, s[1]);
            C_ADD( s[3] , s[0] , s[2] );
            C_SUB( s[4] , s[0] , s[2] );
            C_SUB( Fout[m2], *Fout, s[3] );
            /* Legs 1/2/3 advance through the shared table at 1x/2x/3x rate. */
            w1 += fstride;
            w2 += fstride*2;
            w3 += fstride*3;
            C_ADDTO( *Fout , s[3] );

            Fout[m].r = ADD32_ovflw(s[5].r, s[4].i);
            Fout[m].i = SUB32_ovflw(s[5].i, s[4].r);
            Fout[m3].r = SUB32_ovflw(s[5].r, s[4].i);
            Fout[m3].i = ADD32_ovflw(s[5].i, s[4].r);
            ++Fout;
         }
      }
   }
}
172*a58d3d2aSXin Li
173*a58d3d2aSXin Li
174*a58d3d2aSXin Li #ifndef RADIX_TWO_ONLY
175*a58d3d2aSXin Li
/* Radix-3 butterfly pass over N transforms.  Only compiled when radices
   other than 2/4 are enabled. */
static void kf_bfly3(
     kiss_fft_cpx * Fout,
     const size_t fstride,
     const kiss_fft_state *st,
     int m,
     int N,
     int mm
    )
{
   int blk;
   size_t rem;
   const size_t m2 = 2*m;
   const kiss_twiddle_cpx *w1,*w2;
   kiss_fft_cpx s[5];
   kiss_twiddle_cpx epi3;

   kiss_fft_cpx * const Fout_beg = Fout;
#ifdef FIXED_POINT
   /* Q15 imaginary part of exp(-2i*pi/3); the real part (-16384) is folded
      into the HALF_OF() below and never needed explicitly. */
   /*epi3.r = -16384;*/ /* Unused */
   epi3.i = -28378;
#else
   epi3 = st->twiddles[fstride*m];
#endif
   for (blk=0;blk<N;blk++)
   {
      Fout = Fout_beg + blk*mm;
      w1 = w2 = st->twiddles;
      /* For non-custom modes, m is guaranteed to be a multiple of 4. */
      rem = m;
      do {
         C_MUL(s[1],Fout[m] , *w1);
         C_MUL(s[2],Fout[m2] , *w2);

         C_ADD(s[3],s[1],s[2]);
         C_SUB(s[0],s[1],s[2]);
         w1 += fstride;
         w2 += fstride*2;

         Fout[m].r = SUB32_ovflw(Fout->r, HALF_OF(s[3].r));
         Fout[m].i = SUB32_ovflw(Fout->i, HALF_OF(s[3].i));

         C_MULBYSCALAR( s[0] , epi3.i );

         C_ADDTO(*Fout,s[3]);

         Fout[m2].r = ADD32_ovflw(Fout[m].r, s[0].i);
         Fout[m2].i = SUB32_ovflw(Fout[m].i, s[0].r);

         Fout[m].r = SUB32_ovflw(Fout[m].r, s[0].i);
         Fout[m].i = ADD32_ovflw(Fout[m].i, s[0].r);

         ++Fout;
      } while(--rem);
   }
}
232*a58d3d2aSXin Li
233*a58d3d2aSXin Li
234*a58d3d2aSXin Li #ifndef OVERRIDE_kf_bfly5
/* Radix-5 butterfly pass over N transforms.  ya/yb are exp(-2i*pi/5) and
   exp(-4i*pi/5): hard-coded Q15 in fixed point, read from the twiddle
   table in float. */
static void kf_bfly5(
     kiss_fft_cpx * Fout,
     const size_t fstride,
     const kiss_fft_state *st,
     int m,
     int N,
     int mm
    )
{
   kiss_fft_cpx *out0,*out1,*out2,*out3,*out4;
   int blk, j;
   kiss_fft_cpx s[13];
   const kiss_twiddle_cpx *tw;
   kiss_twiddle_cpx ya,yb;
   kiss_fft_cpx * const Fout_beg = Fout;

#ifdef FIXED_POINT
   ya.r = 10126;
   ya.i = -31164;
   yb.r = -26510;
   yb.i = -19261;
#else
   ya = st->twiddles[fstride*m];
   yb = st->twiddles[fstride*2*m];
#endif
   tw=st->twiddles;

   for (blk=0;blk<N;blk++)
   {
      Fout = Fout_beg + blk*mm;
      out0=Fout;
      out1=out0+m;
      out2=out0+2*m;
      out3=out0+3*m;
      out4=out0+4*m;

      /* For non-custom modes, m is guaranteed to be a multiple of 4. */
      for ( j=0; j<m; ++j ) {
         s[0] = *out0;

         C_MUL(s[1] ,*out1, tw[j*fstride]);
         C_MUL(s[2] ,*out2, tw[2*j*fstride]);
         C_MUL(s[3] ,*out3, tw[3*j*fstride]);
         C_MUL(s[4] ,*out4, tw[4*j*fstride]);

         /* Sums and differences of the symmetric leg pairs (1,4) and (2,3). */
         C_ADD( s[7],s[1],s[4]);
         C_SUB( s[10],s[1],s[4]);
         C_ADD( s[8],s[2],s[3]);
         C_SUB( s[9],s[2],s[3]);

         out0->r = ADD32_ovflw(out0->r, ADD32_ovflw(s[7].r, s[8].r));
         out0->i = ADD32_ovflw(out0->i, ADD32_ovflw(s[7].i, s[8].i));

         s[5].r = ADD32_ovflw(s[0].r, ADD32_ovflw(S_MUL(s[7].r,ya.r), S_MUL(s[8].r,yb.r)));
         s[5].i = ADD32_ovflw(s[0].i, ADD32_ovflw(S_MUL(s[7].i,ya.r), S_MUL(s[8].i,yb.r)));

         s[6].r = ADD32_ovflw(S_MUL(s[10].i,ya.i), S_MUL(s[9].i,yb.i));
         s[6].i = NEG32_ovflw(ADD32_ovflw(S_MUL(s[10].r,ya.i), S_MUL(s[9].r,yb.i)));

         C_SUB(*out1,s[5],s[6]);
         C_ADD(*out4,s[5],s[6]);

         s[11].r = ADD32_ovflw(s[0].r, ADD32_ovflw(S_MUL(s[7].r,yb.r), S_MUL(s[8].r,ya.r)));
         s[11].i = ADD32_ovflw(s[0].i, ADD32_ovflw(S_MUL(s[7].i,yb.r), S_MUL(s[8].i,ya.r)));
         s[12].r = SUB32_ovflw(S_MUL(s[9].i,ya.i), S_MUL(s[10].i,yb.i));
         s[12].i = SUB32_ovflw(S_MUL(s[10].r,yb.i), S_MUL(s[9].r,ya.i));

         C_ADD(*out2,s[11],s[12]);
         C_SUB(*out3,s[11],s[12]);

         ++out0;++out1;++out2;++out3;++out4;
      }
   }
}
309*a58d3d2aSXin Li #endif /* OVERRIDE_kf_bfly5 */
310*a58d3d2aSXin Li
311*a58d3d2aSXin Li
312*a58d3d2aSXin Li #endif
313*a58d3d2aSXin Li
314*a58d3d2aSXin Li
315*a58d3d2aSXin Li #ifdef CUSTOM_MODES
316*a58d3d2aSXin Li
317*a58d3d2aSXin Li static
compute_bitrev_table(int Fout,opus_int16 * f,const size_t fstride,int in_stride,opus_int16 * factors,const kiss_fft_state * st)318*a58d3d2aSXin Li void compute_bitrev_table(
319*a58d3d2aSXin Li int Fout,
320*a58d3d2aSXin Li opus_int16 *f,
321*a58d3d2aSXin Li const size_t fstride,
322*a58d3d2aSXin Li int in_stride,
323*a58d3d2aSXin Li opus_int16 * factors,
324*a58d3d2aSXin Li const kiss_fft_state *st
325*a58d3d2aSXin Li )
326*a58d3d2aSXin Li {
327*a58d3d2aSXin Li const int p=*factors++; /* the radix */
328*a58d3d2aSXin Li const int m=*factors++; /* stage's fft length/p */
329*a58d3d2aSXin Li
330*a58d3d2aSXin Li /*printf ("fft %d %d %d %d %d %d\n", p*m, m, p, s2, fstride*in_stride, N);*/
331*a58d3d2aSXin Li if (m==1)
332*a58d3d2aSXin Li {
333*a58d3d2aSXin Li int j;
334*a58d3d2aSXin Li for (j=0;j<p;j++)
335*a58d3d2aSXin Li {
336*a58d3d2aSXin Li *f = Fout+j;
337*a58d3d2aSXin Li f += fstride*in_stride;
338*a58d3d2aSXin Li }
339*a58d3d2aSXin Li } else {
340*a58d3d2aSXin Li int j;
341*a58d3d2aSXin Li for (j=0;j<p;j++)
342*a58d3d2aSXin Li {
343*a58d3d2aSXin Li compute_bitrev_table( Fout , f, fstride*p, in_stride, factors,st);
344*a58d3d2aSXin Li f += fstride*in_stride;
345*a58d3d2aSXin Li Fout += m;
346*a58d3d2aSXin Li }
347*a58d3d2aSXin Li }
348*a58d3d2aSXin Li }
349*a58d3d2aSXin Li
350*a58d3d2aSXin Li /* facbuf is populated by p1,m1,p2,m2, ...
351*a58d3d2aSXin Li where
352*a58d3d2aSXin Li p[i] * m[i] = m[i-1]
353*a58d3d2aSXin Li m0 = n */
354*a58d3d2aSXin Li static
kf_factor(int n,opus_int16 * facbuf)355*a58d3d2aSXin Li int kf_factor(int n,opus_int16 * facbuf)
356*a58d3d2aSXin Li {
357*a58d3d2aSXin Li int p=4;
358*a58d3d2aSXin Li int i;
359*a58d3d2aSXin Li int stages=0;
360*a58d3d2aSXin Li int nbak = n;
361*a58d3d2aSXin Li
362*a58d3d2aSXin Li /*factor out powers of 4, powers of 2, then any remaining primes */
363*a58d3d2aSXin Li do {
364*a58d3d2aSXin Li while (n % p) {
365*a58d3d2aSXin Li switch (p) {
366*a58d3d2aSXin Li case 4: p = 2; break;
367*a58d3d2aSXin Li case 2: p = 3; break;
368*a58d3d2aSXin Li default: p += 2; break;
369*a58d3d2aSXin Li }
370*a58d3d2aSXin Li if (p>32000 || (opus_int32)p*(opus_int32)p > n)
371*a58d3d2aSXin Li p = n; /* no more factors, skip to end */
372*a58d3d2aSXin Li }
373*a58d3d2aSXin Li n /= p;
374*a58d3d2aSXin Li #ifdef RADIX_TWO_ONLY
375*a58d3d2aSXin Li if (p!=2 && p != 4)
376*a58d3d2aSXin Li #else
377*a58d3d2aSXin Li if (p>5)
378*a58d3d2aSXin Li #endif
379*a58d3d2aSXin Li {
380*a58d3d2aSXin Li return 0;
381*a58d3d2aSXin Li }
382*a58d3d2aSXin Li facbuf[2*stages] = p;
383*a58d3d2aSXin Li if (p==2 && stages > 1)
384*a58d3d2aSXin Li {
385*a58d3d2aSXin Li facbuf[2*stages] = 4;
386*a58d3d2aSXin Li facbuf[2] = 2;
387*a58d3d2aSXin Li }
388*a58d3d2aSXin Li stages++;
389*a58d3d2aSXin Li } while (n > 1);
390*a58d3d2aSXin Li n = nbak;
391*a58d3d2aSXin Li /* Reverse the order to get the radix 4 at the end, so we can use the
392*a58d3d2aSXin Li fast degenerate case. It turns out that reversing the order also
393*a58d3d2aSXin Li improves the noise behaviour. */
394*a58d3d2aSXin Li for (i=0;i<stages/2;i++)
395*a58d3d2aSXin Li {
396*a58d3d2aSXin Li int tmp;
397*a58d3d2aSXin Li tmp = facbuf[2*i];
398*a58d3d2aSXin Li facbuf[2*i] = facbuf[2*(stages-i-1)];
399*a58d3d2aSXin Li facbuf[2*(stages-i-1)] = tmp;
400*a58d3d2aSXin Li }
401*a58d3d2aSXin Li for (i=0;i<stages;i++)
402*a58d3d2aSXin Li {
403*a58d3d2aSXin Li n /= facbuf[2*i];
404*a58d3d2aSXin Li facbuf[2*i+1] = n;
405*a58d3d2aSXin Li }
406*a58d3d2aSXin Li return 1;
407*a58d3d2aSXin Li }
408*a58d3d2aSXin Li
/* Fills twiddles[k] with exp(-2i*pi*k/nfft) for k in [0, nfft). */
static void compute_twiddles(kiss_twiddle_cpx *twiddles, int nfft)
{
   int k;
#ifdef FIXED_POINT
   for (k=0;k<nfft;++k) {
      opus_val32 phase = -k;
      /* kf_cexp2 takes the angle as a fraction of 2*pi in Q15 (<<17 = *4<<15). */
      kf_cexp2(twiddles+k, DIV32(SHL32(phase,17),nfft));
   }
#else
   const double pi=3.14159265358979323846264338327;
   for (k=0;k<nfft;++k) {
      double phase = ( -2*pi /nfft ) * k;
      kf_cexp(twiddles+k, phase );
   }
#endif
}
425*a58d3d2aSXin Li
/* Architecture hook for allocation: the generic C build has no extra
   per-architecture state to set up, so this always succeeds. */
int opus_fft_alloc_arch_c(kiss_fft_state *st) {
   (void)st;
   return 0;
}
430*a58d3d2aSXin Li
431*a58d3d2aSXin Li /*
432*a58d3d2aSXin Li *
433*a58d3d2aSXin Li * Allocates all necessary storage space for the fft and ifft.
434*a58d3d2aSXin Li * The return value is a contiguous block of memory. As such,
435*a58d3d2aSXin Li * It can be freed with free().
436*a58d3d2aSXin Li * */
opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem,const kiss_fft_state * base,int arch)437*a58d3d2aSXin Li kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem,
438*a58d3d2aSXin Li const kiss_fft_state *base, int arch)
439*a58d3d2aSXin Li {
440*a58d3d2aSXin Li kiss_fft_state *st=NULL;
441*a58d3d2aSXin Li size_t memneeded = sizeof(struct kiss_fft_state); /* twiddle factors*/
442*a58d3d2aSXin Li
443*a58d3d2aSXin Li if ( lenmem==NULL ) {
444*a58d3d2aSXin Li st = ( kiss_fft_state*)KISS_FFT_MALLOC( memneeded );
445*a58d3d2aSXin Li }else{
446*a58d3d2aSXin Li if (mem != NULL && *lenmem >= memneeded)
447*a58d3d2aSXin Li st = (kiss_fft_state*)mem;
448*a58d3d2aSXin Li *lenmem = memneeded;
449*a58d3d2aSXin Li }
450*a58d3d2aSXin Li if (st) {
451*a58d3d2aSXin Li opus_int16 *bitrev;
452*a58d3d2aSXin Li kiss_twiddle_cpx *twiddles;
453*a58d3d2aSXin Li
454*a58d3d2aSXin Li st->nfft=nfft;
455*a58d3d2aSXin Li #ifdef FIXED_POINT
456*a58d3d2aSXin Li st->scale_shift = celt_ilog2(st->nfft);
457*a58d3d2aSXin Li if (st->nfft == 1<<st->scale_shift)
458*a58d3d2aSXin Li st->scale = Q15ONE;
459*a58d3d2aSXin Li else
460*a58d3d2aSXin Li st->scale = (1073741824+st->nfft/2)/st->nfft>>(15-st->scale_shift);
461*a58d3d2aSXin Li #else
462*a58d3d2aSXin Li st->scale = 1.f/nfft;
463*a58d3d2aSXin Li #endif
464*a58d3d2aSXin Li if (base != NULL)
465*a58d3d2aSXin Li {
466*a58d3d2aSXin Li st->twiddles = base->twiddles;
467*a58d3d2aSXin Li st->shift = 0;
468*a58d3d2aSXin Li while (st->shift < 32 && nfft<<st->shift != base->nfft)
469*a58d3d2aSXin Li st->shift++;
470*a58d3d2aSXin Li if (st->shift>=32)
471*a58d3d2aSXin Li goto fail;
472*a58d3d2aSXin Li } else {
473*a58d3d2aSXin Li st->twiddles = twiddles = (kiss_twiddle_cpx*)KISS_FFT_MALLOC(sizeof(kiss_twiddle_cpx)*nfft);
474*a58d3d2aSXin Li compute_twiddles(twiddles, nfft);
475*a58d3d2aSXin Li st->shift = -1;
476*a58d3d2aSXin Li }
477*a58d3d2aSXin Li if (!kf_factor(nfft,st->factors))
478*a58d3d2aSXin Li {
479*a58d3d2aSXin Li goto fail;
480*a58d3d2aSXin Li }
481*a58d3d2aSXin Li
482*a58d3d2aSXin Li /* bitrev */
483*a58d3d2aSXin Li st->bitrev = bitrev = (opus_int16*)KISS_FFT_MALLOC(sizeof(opus_int16)*nfft);
484*a58d3d2aSXin Li if (st->bitrev==NULL)
485*a58d3d2aSXin Li goto fail;
486*a58d3d2aSXin Li compute_bitrev_table(0, bitrev, 1,1, st->factors,st);
487*a58d3d2aSXin Li
488*a58d3d2aSXin Li /* Initialize architecture specific fft parameters */
489*a58d3d2aSXin Li if (opus_fft_alloc_arch(st, arch))
490*a58d3d2aSXin Li goto fail;
491*a58d3d2aSXin Li }
492*a58d3d2aSXin Li return st;
493*a58d3d2aSXin Li fail:
494*a58d3d2aSXin Li opus_fft_free(st, arch);
495*a58d3d2aSXin Li return NULL;
496*a58d3d2aSXin Li }
497*a58d3d2aSXin Li
opus_fft_alloc(int nfft,void * mem,size_t * lenmem,int arch)498*a58d3d2aSXin Li kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem, int arch)
499*a58d3d2aSXin Li {
500*a58d3d2aSXin Li return opus_fft_alloc_twiddles(nfft, mem, lenmem, NULL, arch);
501*a58d3d2aSXin Li }
502*a58d3d2aSXin Li
/* Architecture hook for teardown: nothing to release in the generic C build. */
void opus_fft_free_arch_c(kiss_fft_state *st) {
   (void)st;
}
506*a58d3d2aSXin Li
opus_fft_free(const kiss_fft_state * cfg,int arch)507*a58d3d2aSXin Li void opus_fft_free(const kiss_fft_state *cfg, int arch)
508*a58d3d2aSXin Li {
509*a58d3d2aSXin Li if (cfg)
510*a58d3d2aSXin Li {
511*a58d3d2aSXin Li opus_fft_free_arch((kiss_fft_state *)cfg, arch);
512*a58d3d2aSXin Li opus_free((opus_int16*)cfg->bitrev);
513*a58d3d2aSXin Li if (cfg->shift < 0)
514*a58d3d2aSXin Li opus_free((kiss_twiddle_cpx*)cfg->twiddles);
515*a58d3d2aSXin Li opus_free((kiss_fft_state*)cfg);
516*a58d3d2aSXin Li }
517*a58d3d2aSXin Li }
518*a58d3d2aSXin Li
519*a58d3d2aSXin Li #endif /* CUSTOM_MODES */
520*a58d3d2aSXin Li
/* Core in-place FFT: runs the butterfly passes over already bit-reversed
   data in fout, from the innermost stage outward. */
void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout)
{
   int next_m, m;
   int radix;
   int nstages;
   int fstride[MAXFACTORS];
   int stage;
   int shift;

   /* st->shift can be -1 */
   shift = st->shift>0 ? st->shift : 0;

   /* Forward pass over the factor list: fstride[k] is the product of all
      earlier radices, i.e. the twiddle stride for stage k. */
   fstride[0] = 1;
   nstages=0;
   do {
      radix = st->factors[2*nstages];
      m = st->factors[2*nstages+1];
      fstride[nstages+1] = fstride[nstages]*radix;
      nstages++;
   } while(m!=1);
   /* Run the stages from last (innermost) to first. */
   m = st->factors[2*nstages-1];
   for (stage=nstages-1;stage>=0;stage--)
   {
      if (stage!=0)
         next_m = st->factors[2*stage-1];
      else
         next_m = 1;
      switch (st->factors[2*stage])
      {
      case 2:
         kf_bfly2(fout, m, fstride[stage]);
         break;
      case 4:
         kf_bfly4(fout,fstride[stage]<<shift,st,m, fstride[stage], next_m);
         break;
#ifndef RADIX_TWO_ONLY
      case 3:
         kf_bfly3(fout,fstride[stage]<<shift,st,m, fstride[stage], next_m);
         break;
      case 5:
         kf_bfly5(fout,fstride[stage]<<shift,st,m, fstride[stage], next_m);
         break;
#endif
      }
      m = next_m;
   }
}
568*a58d3d2aSXin Li
/* Forward FFT (out-of-place): bit-reverses fin into fout while applying
   the 1/nfft scaling, then runs the butterfly passes in place. */
void opus_fft_c(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
{
   int idx;
   opus_val16 scale;
#ifdef FIXED_POINT
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
      MULT16_32_Q15() on ARM. */
   int scale_shift = st->scale_shift-1;
#endif
   scale = st->scale;

   celt_assert2 (fin != fout, "In-place FFT not supported");
   /* Bit-reverse the input */
   for (idx=0;idx<st->nfft;idx++)
   {
      kiss_fft_cpx v = fin[idx];
      /* In the float build SHR32/MULT16_32_Q16 reduce to a plain multiply
         and scale_shift is never referenced. */
      fout[st->bitrev[idx]].r = SHR32(MULT16_32_Q16(scale, v.r), scale_shift);
      fout[st->bitrev[idx]].i = SHR32(MULT16_32_Q16(scale, v.i), scale_shift);
   }
   opus_fft_impl(st, fout);
}
590*a58d3d2aSXin Li
591*a58d3d2aSXin Li
/* Inverse FFT (out-of-place, unnormalized): bit-reverse the input, then
   use the identity ifft(x) = conj(fft(conj(x))). */
void opus_ifft_c(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
{
   int k;
   celt_assert2 (fin != fout, "In-place FFT not supported");
   /* Bit-reverse the input */
   for (k=0;k<st->nfft;k++)
      fout[st->bitrev[k]] = fin[k];
   /* Conjugate, forward transform, conjugate again. */
   for (k=0;k<st->nfft;k++)
      fout[k].i = -fout[k].i;
   opus_fft_impl(st, fout);
   for (k=0;k<st->nfft;k++)
      fout[k].i = -fout[k].i;
}
605