1 /*
2 Copyright (c) 2019 Hayati Ayguen ( [email protected] )
3 */
4
5 #include "pffastconv.h"
6 #include "pffft.h"
7
8 #include <stdlib.h>
9 #include <stdint.h>
10 #include <stdio.h>
11 #include <math.h>
12 #include <assert.h>
13 #include <string.h>
14
15 #define FASTCONV_DBG_OUT 0
16
17
18 /* detect compiler flavour */
19 #if defined(_MSC_VER)
20 # define RESTRICT __restrict
21 #pragma warning( disable : 4244 4305 4204 4456 )
22 #elif defined(__GNUC__)
23 # define RESTRICT __restrict
24 #endif
25
26
pffastconv_malloc(size_t nb_bytes)27 void *pffastconv_malloc(size_t nb_bytes)
28 {
29 return pffft_aligned_malloc(nb_bytes);
30 }
31
pffastconv_free(void * p)32 void pffastconv_free(void *p)
33 {
34 pffft_aligned_free(p);
35 }
36
pffastconv_simd_size()37 int pffastconv_simd_size()
38 {
39 return pffft_simd_size();
40 }
41
42
43
44 struct PFFASTCONV_Setup
45 {
46 float * Xt; /* input == x in time domain - copy for alignment */
47 float * Xf; /* input == X in freq domain */
48 float * Hf; /* filterCoeffs == H in freq domain */
49 float * Mf; /* input * filterCoeffs in freq domain */
50 PFFFT_Setup *st;
51 int filterLen; /* convolution length */
52 int Nfft; /* FFT/block length */
53 int flags;
54 float scale;
55 };
56
57
pffastconv_new_setup(const float * filterCoeffs,int filterLen,int * blockLen,int flags)58 PFFASTCONV_Setup * pffastconv_new_setup( const float * filterCoeffs, int filterLen, int * blockLen, int flags )
59 {
60 PFFASTCONV_Setup * s = NULL;
61 const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
62 const int minFftLen = 2*pffft_simd_size()*pffft_simd_size();
63 int i, Nfft = 2 * pffft_next_power_of_two(filterLen -1);
64 #if FASTCONV_DBG_OUT
65 const int iOldBlkLen = *blockLen;
66 #endif
67
68 if ( Nfft < minFftLen )
69 Nfft = minFftLen;
70
71 if ( flags & PFFASTCONV_CPLX_FILTER )
72 return NULL;
73
74 s = pffastconv_malloc( sizeof(struct PFFASTCONV_Setup) );
75
76 if ( *blockLen > Nfft ) {
77 Nfft = *blockLen;
78 Nfft = pffft_next_power_of_two(Nfft);
79 }
80 *blockLen = Nfft; /* this is in (complex) samples */
81
82 Nfft *= cplxFactor;
83
84 if ( (flags & PFFASTCONV_DIRECT_INP) && !(flags & PFFASTCONV_CPLX_INP_OUT) )
85 s->Xt = NULL;
86 else
87 s->Xt = pffastconv_malloc((unsigned)Nfft * sizeof(float));
88 s->Xf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
89 s->Hf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
90 s->Mf = pffastconv_malloc((unsigned)Nfft * sizeof(float));
91 s->st = pffft_new_setup(Nfft, PFFFT_REAL); /* with complex: we do 2 x fft() */
92 s->filterLen = filterLen; /* filterLen == convolution length == length of impulse response */
93 if ( cplxFactor == 2 )
94 s->filterLen = 2 * filterLen - 1;
95 s->Nfft = Nfft; /* FFT/block length */
96 s->flags = flags;
97 s->scale = (float)( 1.0 / Nfft );
98
99 memset( s->Xt, 0, (unsigned)Nfft * sizeof(float) );
100 if ( flags & PFFASTCONV_CORRELATION ) {
101 for ( i = 0; i < filterLen; ++i )
102 s->Xt[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ i ];
103 } else {
104 for ( i = 0; i < filterLen; ++i )
105 s->Xt[ ( Nfft - cplxFactor * i ) & (Nfft -1) ] = filterCoeffs[ filterLen - 1 - i ];
106 }
107
108 pffft_transform(s->st, s->Xt, s->Hf, /* tmp = */ s->Mf, PFFFT_FORWARD);
109
110 #if FASTCONV_DBG_OUT
111 printf("\n fastConvSetup(filterLen = %d, blockLen %d) --> blockLen %d, OutLen = %d\n"
112 , filterLen, iOldBlkLen, *blockLen, Nfft - filterLen +1 );
113 #endif
114
115 return s;
116 }
117
118
pffastconv_destroy_setup(PFFASTCONV_Setup * s)119 void pffastconv_destroy_setup( PFFASTCONV_Setup * s )
120 {
121 if (!s)
122 return;
123 pffft_destroy_setup(s->st);
124 pffastconv_free(s->Mf);
125 pffastconv_free(s->Hf);
126 pffastconv_free(s->Xf);
127 if ( s->Xt )
128 pffastconv_free(s->Xt);
129 pffastconv_free(s);
130 }
131
132
pffastconv_apply(PFFASTCONV_Setup * s,const float * input_,int cplxInputLen,float * output_,int applyFlush)133 int pffastconv_apply(PFFASTCONV_Setup * s, const float *input_, int cplxInputLen, float *output_, int applyFlush)
134 {
135 const float * RESTRICT X = input_;
136 float * RESTRICT Y = output_;
137 const int Nfft = s->Nfft;
138 const int filterLen = s->filterLen;
139 const int flags = s->flags;
140 const int cplxFactor = ( (flags & PFFASTCONV_CPLX_INP_OUT) && (flags & PFFASTCONV_CPLX_SINGLE_FFT) ) ? 2 : 1;
141 const int inputLen = cplxFactor * cplxInputLen;
142 int inpOff, procLen, numOut = 0, j, part, cplxOff;
143
144 /* applyFlush != 0:
145 * inputLen - inpOff -filterLen + 1 > 0
146 * <=> inputLen -filterLen + 1 > inpOff
147 * <=> inpOff < inputLen -filterLen + 1
148 *
149 * applyFlush == 0:
150 * inputLen - inpOff >= Nfft
151 * <=> inputLen - Nfft >= inpOff
152 * <=> inpOff <= inputLen - Nfft
153 * <=> inpOff < inputLen - Nfft + 1
154 */
155
156 if ( cplxFactor == 2 )
157 {
158 const int maxOff = applyFlush ? (inputLen -filterLen + 1) : (inputLen - Nfft + 1);
159 #if 0
160 printf( "*** inputLen %d, filterLen %d, Nfft %d => maxOff %d\n", inputLen, filterLen, Nfft, maxOff);
161 #endif
162 for ( inpOff = 0; inpOff < maxOff; inpOff += numOut )
163 {
164 procLen = ( (inputLen - inpOff) >= Nfft ) ? Nfft : (inputLen - inpOff);
165 numOut = ( procLen - filterLen + 1 ) & ( ~1 );
166 if (!numOut)
167 break;
168 #if 0
169 if (!inpOff)
170 printf("*** inpOff = %d, numOut = %d\n", inpOff, numOut);
171 if (inpOff + filterLen + 2 >= maxOff )
172 printf("*** inpOff = %d, inpOff + numOut = %d\n", inpOff, inpOff + numOut);
173 #endif
174
175 if ( flags & PFFASTCONV_DIRECT_INP )
176 {
177 pffft_transform(s->st, X + inpOff, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
178 }
179 else
180 {
181 memcpy( s->Xt, X + inpOff, (unsigned)procLen * sizeof(float) );
182 if ( procLen < Nfft )
183 memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
184
185 pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
186 }
187
188 pffft_zconvolve_no_accu(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);
189
190 if ( flags & PFFASTCONV_DIRECT_OUT )
191 {
192 pffft_transform(s->st, s->Mf, Y + inpOff, s->Xf, PFFFT_BACKWARD);
193 }
194 else
195 {
196 pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
197 memcpy( Y + inpOff, s->Xf, (unsigned)numOut * sizeof(float) );
198 }
199 }
200 return inpOff / cplxFactor;
201 }
202 else
203 {
204 const int maxOff = applyFlush ? (inputLen -filterLen + 1) : (inputLen - Nfft + 1);
205 const int numParts = (flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1;
206
207 for ( inpOff = 0; inpOff < maxOff; inpOff += numOut )
208 {
209 procLen = ( (inputLen - inpOff) >= Nfft ) ? Nfft : (inputLen - inpOff);
210 numOut = procLen - filterLen + 1;
211
212 for ( part = 0; part < numParts; ++part ) /* iterate per real/imag component */
213 {
214
215 if ( flags & PFFASTCONV_CPLX_INP_OUT )
216 {
217 cplxOff = 2 * inpOff + part;
218 for ( j = 0; j < procLen; ++j )
219 s->Xt[j] = X[cplxOff + 2 * j];
220 if ( procLen < Nfft )
221 memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
222
223 pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
224 }
225 else if ( flags & PFFASTCONV_DIRECT_INP )
226 {
227 pffft_transform(s->st, X + inpOff, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
228 }
229 else
230 {
231 memcpy( s->Xt, X + inpOff, (unsigned)procLen * sizeof(float) );
232 if ( procLen < Nfft )
233 memset( s->Xt + procLen, 0, (unsigned)(Nfft - procLen) * sizeof(float) );
234
235 pffft_transform(s->st, s->Xt, s->Xf, /* tmp = */ s->Mf, PFFFT_FORWARD);
236 }
237
238 pffft_zconvolve_no_accu(s->st, s->Xf, s->Hf, /* tmp = */ s->Mf, s->scale);
239
240 if ( flags & PFFASTCONV_CPLX_INP_OUT )
241 {
242 pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
243
244 cplxOff = 2 * inpOff + part;
245 for ( j = 0; j < numOut; ++j )
246 Y[ cplxOff + 2 * j ] = s->Xf[j];
247 }
248 else if ( flags & PFFASTCONV_DIRECT_OUT )
249 {
250 pffft_transform(s->st, s->Mf, Y + inpOff, s->Xf, PFFFT_BACKWARD);
251 }
252 else
253 {
254 pffft_transform(s->st, s->Mf, s->Xf, /* tmp = */ s->Xt, PFFFT_BACKWARD);
255 memcpy( Y + inpOff, s->Xf, (unsigned)numOut * sizeof(float) );
256 }
257
258 }
259 }
260
261 return inpOff;
262 }
263 }
264
265