xref: /aosp_15_r20/external/pffft/test_pffastconv.c (revision 3f1979aa0d7ad34fcf3763de7b7b8f8cd67e5bdd)
1 /*
2   Copyright (c) 2013 Julien Pommier.
3   Copyright (c) 2019  Hayati Ayguen ( [email protected] )
4  */
5 
6 #define _WANT_SNAN  1
7 
8 #include "pffft.h"
9 #include "pffastconv.h"
10 
11 #include <math.h>
12 #include <float.h>
13 #include <limits.h>
14 #include <inttypes.h>
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <time.h>
18 #include <assert.h>
19 #include <string.h>
20 
21 #ifdef HAVE_SYS_TIMES
22 #  include <sys/times.h>
23 #  include <unistd.h>
24 #endif
25 
26 /*
27    vector support macros: the rest of the code is independant of
28    SSE/Altivec/NEON -- adding support for other platforms with 4-element
29    vectors should be limited to these macros
30 */
31 #if 0
32 #include "simd/pf_float.h"
33 #endif
34 
35 #if defined(_MSC_VER)
36 #  define RESTRICT __restrict
37 #elif defined(__GNUC__)
38 #  define RESTRICT __restrict
39 #else
40 #  define RESTRICT
41 #endif
42 
43 
44 #if defined(_MSC_VER)
45 #pragma warning( disable : 4244 )
46 #endif
47 
48 
49 #ifdef SNANF
50   #define INVALID_FLOAT_VAL  SNANF
51 #elif defined(SNAN)
52   #define INVALID_FLOAT_VAL  SNAN
53 #elif defined(NAN)
54   #define INVALID_FLOAT_VAL  NAN
55 #elif defined(INFINITY)
56   #define INVALID_FLOAT_VAL  INFINITY
57 #else
58   #define INVALID_FLOAT_VAL  FLT_MAX
59 #endif
60 
61 
62 #if defined(HAVE_SYS_TIMES)
uclock_sec(void)63   inline double uclock_sec(void) {
64     static double ttclk = 0.;
65     struct tms t;
66     if (ttclk == 0.)
67       ttclk = sysconf(_SC_CLK_TCK);
68     times(&t);
69     /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
70     return ((double)t.tms_utime)) / ttclk;
71   }
72 # else
73   double uclock_sec(void)
74 { return (double)clock()/(double)CLOCKS_PER_SEC; }
75 #endif
76 
77 
78 
79 typedef int            (*pfnConvolution)  (void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush);
80 typedef void*          (*pfnConvSetup)    (float *Hfwd, int Nf, int * BlkLen, int flags);
81 typedef pfnConvolution (*pfnGetConvFnPtr) (void * setup);
82 typedef void           (*pfnConvDestroy)  (void * setup);
83 
84 
85 struct ConvSetup
86 {
87   pfnConvolution pfn;
88   int N;
89   int B;
90   float * H;
91   int flags;
92 };
93 
94 
95 void * convSetupRev( float * H, int N, int * BlkLen, int flags )
96 {
97   struct ConvSetup * s = pffastconv_malloc( sizeof(struct ConvSetup) );
98   int i, Nr = N;
99   if (flags & PFFASTCONV_CPLX_INP_OUT)
100     Nr *= 2;
101   Nr += 4;
102   s->pfn = NULL;
103   s->N = N;
104   s->B = *BlkLen;
105   s->H = pffastconv_malloc((unsigned)Nr * sizeof(float));
106   s->flags = flags;
107   memset(s->H, 0, (unsigned)Nr * sizeof(float));
108   if (flags & PFFASTCONV_CPLX_INP_OUT)
109   {
110     for ( i = 0; i < N; ++i ) {
111       s->H[2*(N-1 -i)  ] = H[i];
112       s->H[2*(N-1 -i)+1] = H[i];
113     }
114     /* simpler detection of overruns */
115     s->H[ 2*N    ] = INVALID_FLOAT_VAL;
116     s->H[ 2*N +1 ] = INVALID_FLOAT_VAL;
117     s->H[ 2*N +2 ] = INVALID_FLOAT_VAL;
118     s->H[ 2*N +3 ] = INVALID_FLOAT_VAL;
119   }
120   else
121   {
122     for ( i = 0; i < N; ++i )
123       s->H[ N-1 -i ] = H[i];
124     /* simpler detection of overruns */
125     s->H[ N    ] = INVALID_FLOAT_VAL;
126     s->H[ N +1 ] = INVALID_FLOAT_VAL;
127     s->H[ N +2 ] = INVALID_FLOAT_VAL;
128     s->H[ N +3 ] = INVALID_FLOAT_VAL;
129   }
130   return s;
131 }
132 
133 void convDestroyRev( void * setup )
134 {
135   struct ConvSetup * s = (struct ConvSetup*)setup;
136   pffastconv_free(s->H);
137   pffastconv_free(setup);
138 }
139 
140 
141 pfnConvolution ConvGetFnPtrRev( void * setup )
142 {
143   struct ConvSetup * s = (struct ConvSetup*)setup;
144   if (!s)
145     return NULL;
146   return s->pfn;
147 }
148 
149 
150 void convSimdDestroy( void * setup )
151 {
152   convDestroyRev(setup);
153 }
154 
155 
156 void * fastConvSetup( float * H, int N, int * BlkLen, int flags )
157 {
158   void * p = pffastconv_new_setup( H, N, BlkLen, flags );
159   if (!p)
160     printf("fastConvSetup(N = %d, *BlkLen = %d, flags = %d) = NULL\n", N, *BlkLen, flags);
161   return p;
162 }
163 
164 
165 void fastConvDestroy( void * setup )
166 {
167   pffastconv_destroy_setup( (PFFASTCONV_Setup*)setup );
168 }
169 
170 
171 
172 int slow_conv_R(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
173 {
174   struct ConvSetup * p = (struct ConvSetup*)setup;
175   const float * RESTRICT X = input;
176   const float * RESTRICT Hrev = p->H;
177   float * RESTRICT Y = output;
178   const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
179   const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
180   int i, j;
181   (void)Yref;
182   (void)applyFlush;
183 
184   if (p->flags & PFFASTCONV_CPLX_INP_OUT)
185   {
186     for ( i = 0; i <= lenNr; i += 2 )
187     {
188       float sumRe = 0.0F, sumIm = 0.0F;
189       for ( j = 0; j < Nr; j += 2 )
190       {
191         sumRe += X[i+j  ] * Hrev[j];
192         sumIm += X[i+j+1] * Hrev[j+1];
193       }
194       Y[i  ] = sumRe;
195       Y[i+1] = sumIm;
196     }
197     return i/2;
198   }
199   else
200   {
201     for ( i = 0; i <= lenNr; ++i )
202     {
203       float sum = 0.0F;
204       for (j = 0; j < Nr; ++j )
205         sum += X[i+j]   * Hrev[j];
206       Y[i] = sum;
207     }
208     return i;
209   }
210 }
211 
212 
213 
214 int slow_conv_A(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
215 {
216   float sum[4];
217   struct ConvSetup * p = (struct ConvSetup*)setup;
218   const float * RESTRICT X = input;
219   const float * RESTRICT Hrev = p->H;
220   float * RESTRICT Y = output;
221   const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
222   const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
223   int i, j;
224   (void)Yref;
225   (void)applyFlush;
226 
227   if (p->flags & PFFASTCONV_CPLX_INP_OUT)
228   {
229     if ( (Nr & 3) == 0 )
230     {
231       for ( i = 0; i <= lenNr; i += 2 )
232       {
233         sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
234         for (j = 0; j < Nr; j += 4 )
235         {
236           sum[0] += X[i+j]   * Hrev[j];
237           sum[1] += X[i+j+1] * Hrev[j+1];
238           sum[2] += X[i+j+2] * Hrev[j+2];
239           sum[3] += X[i+j+3] * Hrev[j+3];
240         }
241         Y[i  ] = sum[0] + sum[2];
242         Y[i+1] = sum[1] + sum[3];
243       }
244     }
245     else
246     {
247       const int M = Nr & (~3);
248       for ( i = 0; i <= lenNr; i += 2 )
249       {
250         float tailSumRe = 0.0F, tailSumIm = 0.0F;
251         sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
252         for (j = 0; j < M; j += 4 )
253         {
254           sum[0] += X[i+j  ] * Hrev[j  ];
255           sum[1] += X[i+j+1] * Hrev[j+1];
256           sum[2] += X[i+j+2] * Hrev[j+2];
257           sum[3] += X[i+j+3] * Hrev[j+3];
258         }
259         for ( ; j < Nr; j += 2 ) {
260           tailSumRe += X[i+j  ] * Hrev[j  ];
261           tailSumIm += X[i+j+1] * Hrev[j+1];
262         }
263         Y[i  ] = ( sum[0] + sum[2] ) + tailSumRe;
264         Y[i+1] = ( sum[1] + sum[3] ) + tailSumIm;
265       }
266     }
267     return i/2;
268   }
269   else
270   {
271     if ( (Nr & 3) == 0 )
272     {
273       for ( i = 0; i <= lenNr; ++i )
274       {
275         sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
276         for (j = 0; j < Nr; j += 4 )
277         {
278           sum[0] += X[i+j]   * Hrev[j];
279           sum[1] += X[i+j+1] * Hrev[j+1];
280           sum[2] += X[i+j+2] * Hrev[j+2];
281           sum[3] += X[i+j+3] * Hrev[j+3];
282         }
283         Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
284       }
285       return i;
286     }
287     else
288     {
289       const int M = Nr & (~3);
290       /* printf("A: Nr = %d, M = %d, H[M] = %f, H[M+1] = %f, H[M+2] = %f, H[M+3] = %f\n", Nr, M, Hrev[M], Hrev[M+1], Hrev[M+2], Hrev[M+3] ); */
291       for ( i = 0; i <= lenNr; ++i )
292       {
293         float tailSum = 0.0;
294         sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
295         for (j = 0; j < M; j += 4 )
296         {
297           sum[0] += X[i+j]   * Hrev[j];
298           sum[1] += X[i+j+1] * Hrev[j+1];
299           sum[2] += X[i+j+2] * Hrev[j+2];
300           sum[3] += X[i+j+3] * Hrev[j+3];
301         }
302         for ( ; j < Nr; ++j )
303           tailSum += X[i+j] * Hrev[j];
304         Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
305       }
306       return i;
307     }
308   }
309 }
310 
311 
312 int slow_conv_B(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush)
313 {
314   float sum[4];
315   struct ConvSetup * p = (struct ConvSetup*)setup;
316   (void)Yref;
317   (void)applyFlush;
318   if (p->flags & PFFASTCONV_SYMMETRIC)
319   {
320     const float * RESTRICT X = input;
321     const float * RESTRICT Hrev = p->H;
322     float * RESTRICT Y = output;
323     const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
324     const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
325     const int h = Nr / 2 -4;
326     const int E = Nr -4;
327     int i, j;
328 
329     if (p->flags & PFFASTCONV_CPLX_INP_OUT)
330     {
331       for ( i = 0; i <= lenNr; i += 2 )
332       {
333         const int k = i + E;
334         sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
335         for (j = 0; j <= h; j += 4 )
336         {
337           sum[0] += Hrev[j  ] * ( X[i+j  ] + X[k-j+2] );
338           sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+3] );
339           sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j  ] );
340           sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j+1] );
341         }
342         Y[i  ] = sum[0] + sum[2];
343         Y[i+1] = sum[1] + sum[3];
344       }
345       return i/2;
346     }
347     else
348     {
349       for ( i = 0; i <= lenNr; ++i )
350       {
351         const int k = i + E;
352         sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
353         for (j = 0; j <= h; j += 4 )
354         {
355           sum[0] += Hrev[j  ] * ( X[i+j  ] + X[k-j+3] );
356           sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+2] );
357           sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j+1] );
358           sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j  ] );
359         }
360         Y[i] = sum[0] + sum[1] + sum[2] + sum[3];
361       }
362       return i;
363     }
364   }
365   else
366   {
367     const float * RESTRICT X = input;
368     const float * RESTRICT Hrev = p->H;
369     float * RESTRICT Y = output;
370     const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N;
371     const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N);
372     int i, j;
373 
374     if (p->flags & PFFASTCONV_CPLX_INP_OUT)
375     {
376       for ( i = 0; i <= lenNr; i += 2 )
377       {
378         sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
379         for (j = 0; j < Nr; j += 4 )
380         {
381           sum[0] += X[i+j]   * Hrev[j];
382           sum[1] += X[i+j+1] * Hrev[j+1];
383           sum[2] += X[i+j+2] * Hrev[j+2];
384           sum[3] += X[i+j+3] * Hrev[j+3];
385         }
386         Y[i  ] = sum[0] + sum[2];
387         Y[i+1] = sum[1] + sum[3];
388       }
389       return i/2;
390     }
391     else
392     {
393       if ( (Nr & 3) == 0 )
394       {
395         for ( i = 0; i <= lenNr; ++i )
396         {
397           sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
398           for (j = 0; j < Nr; j += 4 )
399           {
400             sum[0] += X[i+j]   * Hrev[j];
401             sum[1] += X[i+j+1] * Hrev[j+1];
402             sum[2] += X[i+j+2] * Hrev[j+2];
403             sum[3] += X[i+j+3] * Hrev[j+3];
404           }
405           Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]);
406         }
407         return i;
408       }
409       else
410       {
411         const int M = Nr & (~3);
412         /* printf("B: Nr = %d\n", Nr ); */
413         for ( i = 0; i <= lenNr; ++i )
414         {
415           float tailSum = 0.0;
416           sum[0] = sum[1] = sum[2] = sum[3] = 0.0F;
417           for (j = 0; j < M; j += 4 )
418           {
419             sum[0] += X[i+j]   * Hrev[j];
420             sum[1] += X[i+j+1] * Hrev[j+1];
421             sum[2] += X[i+j+2] * Hrev[j+2];
422             sum[3] += X[i+j+3] * Hrev[j+3];
423           }
424           for ( ; j < Nr; ++j )
425             tailSum += X[i+j] * Hrev[j];
426           Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum;
427         }
428         return i;
429       }
430     }
431   }
432 
433 }
434 
435 
436 int fast_conv(void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush)
437 {
438   (void)Yref;
439   return pffastconv_apply( (PFFASTCONV_Setup*)setup, X, len, Y, applyFlush );
440 }
441 
442 
443 
444 void printFirst( const float * V, const char * st, const int N, const int perLine )
445 {
446   (void)V;  (void)st;  (void)N;  (void)perLine;
447   return;
448 #if 0
449   int i;
450   for ( i = 0; i < N; ++i )
451   {
452     if ( (i % perLine) == 0 )
453       printf("\n%s[%d]", st, i);
454     printf("\t%.1f", V[i]);
455   }
456   printf("\n");
457 #endif
458 }
459 
460 
461 
462 #define NUMY       11
463 
464 
465 int test(int FILTERLEN, int convFlags, const int testOutLen, int printDbg, int printSpeed) {
466   double t0, t1, tstop, td, tdref;
467   float *X, *H;
468   float *Y[NUMY];
469   int64_t outN[NUMY];
470   /* 256 KFloats or 16 MFloats data */
471 #if 1
472   const int len = testOutLen ? (1 << 18) : (1 << 24);
473 #elif 0
474   const int len = testOutLen ? (1 << 18) : (1 << 13);
475 #else
476   const int len = testOutLen ? (1 << 18) : (1024);
477 #endif
478   const int cplxFactor = ( convFlags & PFFASTCONV_CPLX_INP_OUT ) ? 2 : 1;
479   const int lenC = len / cplxFactor;
480 
481   int yi, yc, posMaxErr;
482   float yRangeMin, yRangeMax, yErrLimit, maxErr = 0.0;
483   int i, j, numErrOverLimit, iter;
484   int retErr = 0;
485 
486   /*                                  0               1               2               3                   4                   5                   6                   7                   8                      9  */
487   pfnConvSetup   aSetup[NUMY]     = { convSetupRev,   convSetupRev,   convSetupRev,   fastConvSetup,      fastConvSetup,      fastConvSetup,      fastConvSetup,      fastConvSetup,      fastConvSetup,         fastConvSetup   };
488   pfnConvDestroy aDestroy[NUMY]   = { convDestroyRev, convDestroyRev, convDestroyRev, fastConvDestroy,    fastConvDestroy,    fastConvDestroy,    fastConvDestroy,    fastConvDestroy,    fastConvDestroy,       fastConvDestroy };
489   pfnGetConvFnPtr aGetFnPtr[NUMY] = { NULL,           NULL,           NULL,           NULL,               NULL,               NULL,               NULL,               NULL,               NULL,                  NULL,           };
490   pfnConvolution aConv[NUMY]      = { slow_conv_R,    slow_conv_A,    slow_conv_B,    fast_conv,          fast_conv,          fast_conv,          fast_conv,          fast_conv,          fast_conv,             fast_conv       };
491   const char * convText[NUMY]     = { "R(non-simd)",  "A(non-simd)",  "B(non-simd)",  "fast_conv_64",     "fast_conv_128",    "fast_conv_256",    "fast_conv_512",    "fast_conv_1K",     "fast_conv_2K",        "fast_conv_4K"  };
492   int    aFastAlgo[NUMY]          = { 0,              0,              0,              1,                  1,                  1,                  1,                  1,                  1,                     1               };
493   void * aSetupCfg[NUMY]          = { NULL,           NULL,           NULL,           NULL,               NULL,               NULL,               NULL,               NULL,               NULL,                  NULL            };
494   int    aBlkLen[NUMY]            = { 1024,           1024,           1024,           64,                 128,                256,                512,                1024,               2048,                  4096            };
495 #if 1
496   int    aRunAlgo[NUMY]           = { 1,              1,              1,              FILTERLEN<64,       FILTERLEN<128,      FILTERLEN<256,      FILTERLEN<512,      FILTERLEN<1024,     FILTERLEN<2048,        FILTERLEN<4096  };
497 #elif 0
498   int    aRunAlgo[NUMY]           = { 1,              0,              0,              0 && FILTERLEN<64,  1 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048,  0 && FILTERLEN<4096  };
499 #else
500   int    aRunAlgo[NUMY]           = { 1,              1,              1,              0 && FILTERLEN<64,  0 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048,  0 && FILTERLEN<4096  };
501 #endif
502   double aSpeedFactor[NUMY], aDuration[NUMY], procSmpPerSec[NUMY];
503 
504   X = pffastconv_malloc( (unsigned)(len+4) * sizeof(float) );
505   for ( i=0; i < NUMY; ++i)
506   {
507     if ( 1 || i < 2 )
508       Y[i] = pffastconv_malloc( (unsigned)len * sizeof(float) );
509     else
510       Y[i] = Y[1];
511 
512     Y[i][0] = 123.F;  /* test for pffft_zconvolve_no_accu() */
513     aSpeedFactor[i] = -1.0;
514     aDuration[i] = -1.0;
515     procSmpPerSec[i] = -1.0;
516   }
517 
518   H = pffastconv_malloc((unsigned)FILTERLEN * sizeof(float));
519 
520   /* initialize input */
521   if ( convFlags & PFFASTCONV_CPLX_INP_OUT )
522   {
523     for ( i = 0; i < lenC; ++i )
524     {
525       X[2*i  ] = (float)(i % 4093);  /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
526       X[2*i+1] = (float)((i+2048) % 4093);
527     }
528   }
529   else
530   {
531     for ( i = 0; i < len; ++i )
532       X[i] = (float)(i % 4093);  /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */
533   }
534   X[ len    ] = INVALID_FLOAT_VAL;
535   X[ len +1 ] = INVALID_FLOAT_VAL;
536   X[ len +2 ] = INVALID_FLOAT_VAL;
537   X[ len +3 ] = INVALID_FLOAT_VAL;
538 
539   if (!testOutLen)
540     printFirst( X, "X", 64, 8 );
541 
542   /* filter coeffs */
543   memset( H, 0, FILTERLEN * sizeof(float) );
544 #if 1
545   if ( convFlags & PFFASTCONV_SYMMETRIC )
546   {
547     const int half = FILTERLEN / 2;
548     for ( j = 0; j < half; ++j ) {
549       switch (j % 3) {
550         case 0: H[j] = H[FILTERLEN-1-j] = -1.0F;  break;
551         case 1: H[j] = H[FILTERLEN-1-j] =  1.0F;  break;
552         case 2: H[j] = H[FILTERLEN-1-j] =  0.5F;  break;
553       }
554     }
555   }
556   else
557   {
558     for ( j = 0; j < FILTERLEN; ++j ) {
559       switch (j % 3) {
560         case 0: H[j] = -1.0F;  break;
561         case 1: H[j] = 1.0F;   break;
562         case 2: H[j] = 0.5F;   break;
563       }
564     }
565   }
566 #else
567   H[0] = 1.0F;
568   H[FILTERLEN -1] = 1.0F;
569 #endif
570   if (!testOutLen)
571     printFirst( H, "H", FILTERLEN, 8 );
572 
573   printf("\n");
574   printf("filterLen = %d\t%s%s\t%s:\n", FILTERLEN,
575     ((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"),
576     (convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "",
577     ((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym") );
578 
579   while (1)
580   {
581 
582     for ( yi = 0; yi < NUMY; ++yi )
583     {
584       if (!aRunAlgo[yi])
585         continue;
586 
587       aSetupCfg[yi] = aSetup[yi]( H, FILTERLEN, &aBlkLen[yi], convFlags );
588 
589       /* get effective apply function ptr */
590       if ( aSetupCfg[yi] && aGetFnPtr[yi] )
591         aConv[yi] = aGetFnPtr[yi]( aSetupCfg[yi] );
592 
593       if ( aSetupCfg[yi] && aConv[yi] ) {
594         if (testOutLen)
595         {
596           t0 = uclock_sec();
597           outN[yi] = aConv[yi]( aSetupCfg[yi], X, lenC, Y[yi], Y[0], 1 /* applyFlush */ );
598           t1 = uclock_sec();
599           td = t1 - t0;
600         }
601         else
602         {
603           const int blkLen = 4096;  /* required for 'fast_conv_4K' */
604           int64_t offC = 0, offS, Nout;
605           int k;
606           iter = 0;
607           outN[yi] = 0;
608           t0 = uclock_sec();
609           tstop = t0 + 0.25;  /* benchmark duration: 250 ms */
610           do {
611             for ( k = 0; k < 128 && offC +blkLen < lenC; ++k )
612             {
613               offS = cplxFactor * offC;
614               Nout = aConv[yi]( aSetupCfg[yi], X +offS, blkLen, Y[yi] +offS, Y[0], (offC +blkLen >= lenC) /* applyFlush */ );
615               offC += Nout;
616               ++iter;
617               if ( !Nout )
618                 break;
619               if ( offC +blkLen >= lenC )
620               {
621                 outN[yi] += offC;
622                 offC = 0;
623               }
624             }
625             t1 = uclock_sec();
626           } while ( t1 < tstop );
627           outN[yi] += offC;
628           td = t1 - t0;
629           procSmpPerSec[yi] = cplxFactor * (double)outN[yi] / td;
630         }
631       }
632       else
633       {
634         t0 = t1 = td = 0.0;
635         outN[yi] = 0;
636       }
637       aDuration[yi] = td;
638       if ( yi == 0 ) {
639         const float * Yvals = Y[0];
640         const int64_t refOutLen = cplxFactor * outN[0];
641         tdref = td;
642         if (printDbg) {
643           printf("convolution '%s' took: %f ms\n", convText[yi], td*1000.0);
644           printf("  convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
645         }
646         aSpeedFactor[yi] = 1.0;
647         /*  */
648         yRangeMin = FLT_MAX;
649         yRangeMax = FLT_MIN;
650         for ( i = 0; i < refOutLen; ++i )
651         {
652           if ( yRangeMax < Yvals[i] )  yRangeMax = Yvals[i];
653           if ( yRangeMin > Yvals[i] )  yRangeMin = Yvals[i];
654         }
655         yErrLimit = fabsf(yRangeMax - yRangeMin) / ( 100.0F * 1000.0F );
656         /* yErrLimit = 0.01F; */
657         if (testOutLen) {
658           if (1) {
659             printf("reference output len = %" PRId64 " smp\n", outN[0]);
660             printf("reference output range |%.1f ..%.1f| = %.1f ==> err limit = %f\n", yRangeMin, yRangeMax, yRangeMax - yRangeMin, yErrLimit);
661           }
662           printFirst( Yvals, "Yref", 64, 8 );
663         }
664       }
665       else
666       {
667         aSpeedFactor[yi] = tdref / td;
668         if (printDbg) {
669           printf("\nconvolution '%s' took: %f ms == %f %% == %f X\n", convText[yi], td*1000.0, td * 100 / tdref, tdref / td);
670           printf("  convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor);
671         }
672       }
673     }
674 
675     int iMaxSpeedSlowAlgo = -1;
676     int iFirstFastAlgo = -1;
677     int iMaxSpeedFastAlgo = -1;
678     int iPrintedRefOutLen = 0;
679     {
680       for ( yc = 1; yc < NUMY; ++yc )
681       {
682         if (!aRunAlgo[yc])
683           continue;
684         if (aFastAlgo[yc]) {
685           if ( iMaxSpeedFastAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedFastAlgo] )
686             iMaxSpeedFastAlgo = yc;
687 
688           if (iFirstFastAlgo < 0)
689             iFirstFastAlgo = yc;
690         }
691         else
692         {
693           if ( iMaxSpeedSlowAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedSlowAlgo] )
694             iMaxSpeedSlowAlgo = yc;
695         }
696       }
697 
698       if (printSpeed)
699       {
700         if (testOutLen)
701         {
702           if (iMaxSpeedSlowAlgo >= 0 )
703             printf("fastest slow algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedSlowAlgo], aSpeedFactor[iMaxSpeedSlowAlgo], 1000.0 * aDuration[iMaxSpeedSlowAlgo]);
704           if (0 != iMaxSpeedSlowAlgo && aRunAlgo[0])
705             printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[0], aSpeedFactor[0], 1000.0 * aDuration[0]);
706           if (1 != iMaxSpeedSlowAlgo && aRunAlgo[1])
707             printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[1], aSpeedFactor[1], 1000.0 * aDuration[1]);
708 
709           if (iFirstFastAlgo >= 0 && iFirstFastAlgo != iMaxSpeedFastAlgo && aRunAlgo[iFirstFastAlgo])
710             printf("first   fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo],    aSpeedFactor[iFirstFastAlgo],    1000.0 * aDuration[iFirstFastAlgo]);
711           if (iFirstFastAlgo >= 0 && iFirstFastAlgo+1 != iMaxSpeedFastAlgo && iFirstFastAlgo+1 < NUMY && aRunAlgo[iFirstFastAlgo+1])
712             printf("2nd     fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo+1],  aSpeedFactor[iFirstFastAlgo+1],  1000.0 * aDuration[iFirstFastAlgo+1]);
713 
714           if ( 0 <= iMaxSpeedFastAlgo && iMaxSpeedFastAlgo < NUMY && aRunAlgo[iMaxSpeedFastAlgo] )
715           {
716             printf("fastest fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedFastAlgo], aSpeedFactor[iMaxSpeedFastAlgo], 1000.0 * aDuration[iMaxSpeedFastAlgo]);
717             if ( 0 <= iMaxSpeedSlowAlgo && iMaxSpeedSlowAlgo < NUMY && aRunAlgo[iMaxSpeedSlowAlgo] )
718               printf("fast / slow ratio: %f X\n", aSpeedFactor[iMaxSpeedFastAlgo] / aSpeedFactor[iMaxSpeedSlowAlgo] );
719           }
720           printf("\n");
721         }
722         else
723         {
724           for ( yc = 0; yc < NUMY; ++yc )
725           {
726             if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0)
727               continue;
728             printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\n",
729               convText[yc], (double)outN[yc]/(1000.0 * 1000.0), 1000.0 * aDuration[yc], procSmpPerSec[yc] * 0.001 );
730           }
731         }
732 
733       }
734     }
735 
736 
737     for ( yc = 1; yc < NUMY; ++yc )
738     {
739       const float * Yref;
740       const float * Ycurr;
741       int outMin;
742 
743       if (!aRunAlgo[yc])
744         continue;
745 
746       if (printDbg)
747         printf("\n");
748 
749       if ( outN[yc] == 0 )
750       {
751         printf("output size 0: '%s' not implemented\n", convText[yc]);
752       }
753       else if ( outN[0] != outN[yc] /* && aFastAlgo[yc] */ && testOutLen )
754       {
755         if (!iPrintedRefOutLen)
756         {
757           printf("reference output size = %" PRId64 ", delta to (cplx) input length = %" PRId64 " smp\n", outN[0], (len / cplxFactor) - outN[0]);
758           iPrintedRefOutLen = 1;
759         }
760         printf("output size doesn't match!: ref (FILTERLEN %d) returned %" PRId64 " smp, '%s' returned %" PRId64 " smp : delta = %" PRId64 " smp\n",
761           FILTERLEN, outN[0], convText[yc], outN[yc], outN[yc] - outN[0] );
762         retErr = 1;
763       }
764 
765       posMaxErr = 0;
766       maxErr = -1.0;
767       Yref = Y[0];
768       Ycurr = Y[yc];
769       outMin = ( outN[yc] < outN[0] ) ? outN[yc] : outN[0];
770       numErrOverLimit = 0;
771       for ( i = 0; i < outMin; ++i )
772       {
773         if ( numErrOverLimit < 6 && fabs(Ycurr[i] - Yref[i]) >= yErrLimit )
774         {
775           printf("algo '%s': at %d: ***ERROR*** = %f, errLimit = %f, ref = %f, actual = %f\n",
776             convText[yc], i, fabs(Ycurr[i] - Yref[i]), yErrLimit, Yref[i], Ycurr[i] );
777           ++numErrOverLimit;
778         }
779 
780         if ( fabs(Ycurr[i] - Yref[i]) > maxErr )
781         {
782           maxErr = fabsf(Ycurr[i] - Yref[i]);
783           posMaxErr = i;
784         }
785       }
786 
787       if ( printDbg || (iMaxSpeedSlowAlgo == i) || (iMaxSpeedFastAlgo == i) )
788         printf("max difference for '%s' is %g at sample idx %d of max inp 4093-1 == %f %%\n", convText[yc], maxErr, posMaxErr, maxErr * 100.0 / 4092.0 );
789     }
790 
791     break;
792   }
793 
794   pffastconv_free(X);
795   for ( i=0; i < NUMY; ++i)
796   {
797     if ( 1 || i < 2 )
798       pffastconv_free( Y[i] );
799     if (!aRunAlgo[i])
800       continue;
801     aDestroy[i]( aSetupCfg[i] );
802   }
803 
804   pffastconv_free(H);
805 
806   return retErr;
807 }
808 
809 /* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */
810 void validate_pffft_simd();
811 int  validate_pffft_simd_ex(FILE * DbgOut);
812 
813 
814 int main(int argc, char **argv)
815 {
816   int result = 0;
817   int i, k, M, flagsA, flagsB, flagsC, testOutLen, printDbg, printSpeed;
818   int testOutLens = 1, benchConv = 1, quickTest = 0, slowTest = 0;
819   int testReal = 1, testCplx = 1, testSymetric = 0;
820 
821   for ( i = 1; i < argc; ++i ) {
822 
823     if (!strcmp(argv[i], "--test-simd")) {
824       int numErrs = validate_pffft_simd_ex(stdout);
825       fprintf( ( numErrs != 0 ? stderr : stdout ), "validate_pffft_simd_ex() returned %d errors!\n", numErrs);
826       return ( numErrs > 0 ? 1 : 0 );
827     }
828 
829     if (!strcmp(argv[i], "--no-len")) {
830       testOutLens = 0;
831     }
832     else if (!strcmp(argv[i], "--no-bench")) {
833       benchConv = 0;
834     }
835     else if (!strcmp(argv[i], "--quick")) {
836       quickTest = 1;
837     }
838     else if (!strcmp(argv[i], "--slow")) {
839       slowTest = 1;
840     }
841     else if (!strcmp(argv[i], "--real")) {
842       testCplx = 0;
843     }
844     else if (!strcmp(argv[i], "--cplx")) {
845       testReal = 0;
846     }
847     else if (!strcmp(argv[i], "--sym")) {
848       testSymetric = 1;
849     }
850     else /* if (!strcmp(argv[i], "--help")) */ {
851       printf("usage: %s [--test-simd] [--no-len] [--no-bench] [--quick|--slow] [--real|--cplx] [--sym]\n", argv[0]);
852       exit(1);
853     }
854   }
855 
856 
857   if (testOutLens)
858   {
859     for ( k = 0; k < 3; ++k )
860     {
861       if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
862         continue;
863       printf("\n\n==========\n");
864       printf("testing %s %s output lengths ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
865       printf("==========\n");
866       flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
867       flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
868       flagsC = flagsB | PFFASTCONV_CPLX_SINGLE_FFT;
869       testOutLen = 1;
870       printDbg = 0;
871       printSpeed = 0;
872       for ( M = 128 - 4; M <= (quickTest ? 128+16 : 256); ++M )
873       {
874         if ( (M % 16) != 0 && testSymetric )
875           continue;
876         result |= test(M, flagsB, testOutLen, printDbg, printSpeed);
877       }
878     }
879   }
880 
881   if (benchConv)
882   {
883     for ( k = 0; k < 3; ++k )
884     {
885       if ( (k == 0 && !testReal) || (k > 0 && !testCplx) )
886         continue;
887       printf("\n\n==========\n");
888       printf("starting %s %s benchmark against linear convolutions ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) );
889       printf("==========\n");
890       flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT;
891       flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 );
892       flagsC = flagsB | ( k == 2 ? PFFASTCONV_CPLX_SINGLE_FFT : 0 );
893       testOutLen = 0;
894       printDbg = 0;
895       printSpeed = 1;
896       if (!slowTest) {
897         result |= test( 32,     flagsC, testOutLen, printDbg, printSpeed);
898         result |= test( 32+ 16, flagsC, testOutLen, printDbg, printSpeed);
899         result |= test( 64,     flagsC, testOutLen, printDbg, printSpeed);
900         result |= test( 64+ 32, flagsC, testOutLen, printDbg, printSpeed);
901         result |= test(128,     flagsC, testOutLen, printDbg, printSpeed);
902       }
903       if (!quickTest) {
904         result |= test(128+ 64, flagsC, testOutLen, printDbg, printSpeed);
905         result |= test(256,     flagsC, testOutLen, printDbg, printSpeed);
906         result |= test(256+128, flagsC, testOutLen, printDbg, printSpeed);
907         result |= test(512,     flagsC, testOutLen, printDbg, printSpeed);
908         result |= test(1024,    flagsC, testOutLen, printDbg, printSpeed);
909       }
910     }
911   }
912 
913   return result;
914 }
915 
916