1 /* 2 Copyright (c) 2013 Julien Pommier. 3 Copyright (c) 2019 Hayati Ayguen ( [email protected] ) 4 */ 5 6 #define _WANT_SNAN 1 7 8 #include "pffft.h" 9 #include "pffastconv.h" 10 11 #include <math.h> 12 #include <float.h> 13 #include <limits.h> 14 #include <inttypes.h> 15 #include <stdio.h> 16 #include <stdlib.h> 17 #include <time.h> 18 #include <assert.h> 19 #include <string.h> 20 21 #ifdef HAVE_SYS_TIMES 22 # include <sys/times.h> 23 # include <unistd.h> 24 #endif 25 26 /* 27 vector support macros: the rest of the code is independant of 28 SSE/Altivec/NEON -- adding support for other platforms with 4-element 29 vectors should be limited to these macros 30 */ 31 #if 0 32 #include "simd/pf_float.h" 33 #endif 34 35 #if defined(_MSC_VER) 36 # define RESTRICT __restrict 37 #elif defined(__GNUC__) 38 # define RESTRICT __restrict 39 #else 40 # define RESTRICT 41 #endif 42 43 44 #if defined(_MSC_VER) 45 #pragma warning( disable : 4244 ) 46 #endif 47 48 49 #ifdef SNANF 50 #define INVALID_FLOAT_VAL SNANF 51 #elif defined(SNAN) 52 #define INVALID_FLOAT_VAL SNAN 53 #elif defined(NAN) 54 #define INVALID_FLOAT_VAL NAN 55 #elif defined(INFINITY) 56 #define INVALID_FLOAT_VAL INFINITY 57 #else 58 #define INVALID_FLOAT_VAL FLT_MAX 59 #endif 60 61 62 #if defined(HAVE_SYS_TIMES) uclock_sec(void)63 inline double uclock_sec(void) { 64 static double ttclk = 0.; 65 struct tms t; 66 if (ttclk == 0.) 67 ttclk = sysconf(_SC_CLK_TCK); 68 times(&t); 69 /* use only the user time of this process - not realtime, which depends on OS-scheduler .. */ 70 return ((double)t.tms_utime)) / ttclk; 71 } 72 # else 73 double uclock_sec(void) 74 { return (double)clock()/(double)CLOCKS_PER_SEC; } 75 #endif 76 77 78 79 typedef int (*pfnConvolution) (void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush); 80 typedef void* (*pfnConvSetup) (float *Hfwd, int Nf, int * BlkLen, int flags); 81 typedef pfnConvolution (*pfnGetConvFnPtr) (void * setup); 82 typedef void (*pfnConvDestroy) (void * setup); 83 84 85 struct ConvSetup 86 { 87 pfnConvolution pfn; 88 int N; 89 int B; 90 float * H; 91 int flags; 92 }; 93 94 95 void * convSetupRev( float * H, int N, int * BlkLen, int flags ) 96 { 97 struct ConvSetup * s = pffastconv_malloc( sizeof(struct ConvSetup) ); 98 int i, Nr = N; 99 if (flags & PFFASTCONV_CPLX_INP_OUT) 100 Nr *= 2; 101 Nr += 4; 102 s->pfn = NULL; 103 s->N = N; 104 s->B = *BlkLen; 105 s->H = pffastconv_malloc((unsigned)Nr * sizeof(float)); 106 s->flags = flags; 107 memset(s->H, 0, (unsigned)Nr * sizeof(float)); 108 if (flags & PFFASTCONV_CPLX_INP_OUT) 109 { 110 for ( i = 0; i < N; ++i ) { 111 s->H[2*(N-1 -i) ] = H[i]; 112 s->H[2*(N-1 -i)+1] = H[i]; 113 } 114 /* simpler detection of overruns */ 115 s->H[ 2*N ] = INVALID_FLOAT_VAL; 116 s->H[ 2*N +1 ] = INVALID_FLOAT_VAL; 117 s->H[ 2*N +2 ] = INVALID_FLOAT_VAL; 118 s->H[ 2*N +3 ] = INVALID_FLOAT_VAL; 119 } 120 else 121 { 122 for ( i = 0; i < N; ++i ) 123 s->H[ N-1 -i ] = H[i]; 124 /* simpler detection of overruns */ 125 s->H[ N ] = INVALID_FLOAT_VAL; 126 s->H[ N +1 ] = INVALID_FLOAT_VAL; 127 s->H[ N +2 ] = INVALID_FLOAT_VAL; 128 s->H[ N +3 ] = INVALID_FLOAT_VAL; 129 } 130 return s; 131 } 132 133 void convDestroyRev( void * setup ) 134 { 135 struct ConvSetup * s = (struct ConvSetup*)setup; 136 pffastconv_free(s->H); 137 pffastconv_free(setup); 138 } 139 140 141 pfnConvolution ConvGetFnPtrRev( void * setup ) 142 { 143 struct ConvSetup * s = (struct ConvSetup*)setup; 144 if (!s) 145 return NULL; 146 return s->pfn; 147 } 148 149 150 void convSimdDestroy( void * setup ) 151 { 152 convDestroyRev(setup); 153 } 154 155 156 void * fastConvSetup( float * H, int N, int * BlkLen, int flags ) 157 { 158 void * p = pffastconv_new_setup( H, N, BlkLen, flags ); 159 if (!p) 160 printf("fastConvSetup(N = %d, *BlkLen = %d, flags = %d) = NULL\n", N, *BlkLen, flags); 161 return p; 162 } 163 164 165 void fastConvDestroy( void * setup ) 166 { 167 pffastconv_destroy_setup( (PFFASTCONV_Setup*)setup ); 168 } 169 170 171 172 int slow_conv_R(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush) 173 { 174 struct ConvSetup * p = (struct ConvSetup*)setup; 175 const float * RESTRICT X = input; 176 const float * RESTRICT Hrev = p->H; 177 float * RESTRICT Y = output; 178 const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N; 179 const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N); 180 int i, j; 181 (void)Yref; 182 (void)applyFlush; 183 184 if (p->flags & PFFASTCONV_CPLX_INP_OUT) 185 { 186 for ( i = 0; i <= lenNr; i += 2 ) 187 { 188 float sumRe = 0.0F, sumIm = 0.0F; 189 for ( j = 0; j < Nr; j += 2 ) 190 { 191 sumRe += X[i+j ] * Hrev[j]; 192 sumIm += X[i+j+1] * Hrev[j+1]; 193 } 194 Y[i ] = sumRe; 195 Y[i+1] = sumIm; 196 } 197 return i/2; 198 } 199 else 200 { 201 for ( i = 0; i <= lenNr; ++i ) 202 { 203 float sum = 0.0F; 204 for (j = 0; j < Nr; ++j ) 205 sum += X[i+j] * Hrev[j]; 206 Y[i] = sum; 207 } 208 return i; 209 } 210 } 211 212 213 214 int slow_conv_A(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush) 215 { 216 float sum[4]; 217 struct ConvSetup * p = (struct ConvSetup*)setup; 218 const float * RESTRICT X = input; 219 const float * RESTRICT Hrev = p->H; 220 float * RESTRICT Y = output; 221 const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N; 222 const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N); 223 int i, j; 224 (void)Yref; 225 (void)applyFlush; 226 227 if (p->flags & PFFASTCONV_CPLX_INP_OUT) 228 { 229 if ( (Nr & 3) == 0 ) 230 { 231 for ( i = 0; i <= lenNr; i += 2 ) 232 { 233 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; 234 for (j = 0; j < Nr; j += 4 ) 235 { 236 sum[0] += X[i+j] * Hrev[j]; 237 sum[1] += X[i+j+1] * Hrev[j+1]; 238 sum[2] += X[i+j+2] * Hrev[j+2]; 239 sum[3] += X[i+j+3] * Hrev[j+3]; 240 } 241 Y[i ] = sum[0] + sum[2]; 242 Y[i+1] = sum[1] + sum[3]; 243 } 244 } 245 else 246 { 247 const int M = Nr & (~3); 248 for ( i = 0; i <= lenNr; i += 2 ) 249 { 250 float tailSumRe = 0.0F, tailSumIm = 0.0F; 251 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; 252 for (j = 0; j < M; j += 4 ) 253 { 254 sum[0] += X[i+j ] * Hrev[j ]; 255 sum[1] += X[i+j+1] * Hrev[j+1]; 256 sum[2] += X[i+j+2] * Hrev[j+2]; 257 sum[3] += X[i+j+3] * Hrev[j+3]; 258 } 259 for ( ; j < Nr; j += 2 ) { 260 tailSumRe += X[i+j ] * Hrev[j ]; 261 tailSumIm += X[i+j+1] * Hrev[j+1]; 262 } 263 Y[i ] = ( sum[0] + sum[2] ) + tailSumRe; 264 Y[i+1] = ( sum[1] + sum[3] ) + tailSumIm; 265 } 266 } 267 return i/2; 268 } 269 else 270 { 271 if ( (Nr & 3) == 0 ) 272 { 273 for ( i = 0; i <= lenNr; ++i ) 274 { 275 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; 276 for (j = 0; j < Nr; j += 4 ) 277 { 278 sum[0] += X[i+j] * Hrev[j]; 279 sum[1] += X[i+j+1] * Hrev[j+1]; 280 sum[2] += X[i+j+2] * Hrev[j+2]; 281 sum[3] += X[i+j+3] * Hrev[j+3]; 282 } 283 Y[i] = sum[0] + sum[1] + sum[2] + sum[3]; 284 } 285 return i; 286 } 287 else 288 { 289 const int M = Nr & (~3); 290 /* printf("A: Nr = %d, M = %d, H[M] = %f, H[M+1] = %f, H[M+2] = %f, H[M+3] = %f\n", Nr, M, Hrev[M], Hrev[M+1], Hrev[M+2], Hrev[M+3] ); */ 291 for ( i = 0; i <= lenNr; ++i ) 292 { 293 float tailSum = 0.0; 294 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; 295 for (j = 0; j < M; j += 4 ) 296 { 297 sum[0] += X[i+j] * Hrev[j]; 298 sum[1] += X[i+j+1] * Hrev[j+1]; 299 sum[2] += X[i+j+2] * Hrev[j+2]; 300 sum[3] += X[i+j+3] * Hrev[j+3]; 301 } 302 for ( ; j < Nr; ++j ) 303 tailSum += X[i+j] * Hrev[j]; 304 Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum; 305 } 306 return i; 307 } 308 } 309 } 310 311 312 int slow_conv_B(void * setup, const float * input, int len, float *output, const float *Yref, int applyFlush) 313 { 314 float sum[4]; 315 struct ConvSetup * p = (struct ConvSetup*)setup; 316 (void)Yref; 317 (void)applyFlush; 318 if (p->flags & PFFASTCONV_SYMMETRIC) 319 { 320 const float * RESTRICT X = input; 321 const float * RESTRICT Hrev = p->H; 322 float * RESTRICT Y = output; 323 const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N; 324 const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N); 325 const int h = Nr / 2 -4; 326 const int E = Nr -4; 327 int i, j; 328 329 if (p->flags & PFFASTCONV_CPLX_INP_OUT) 330 { 331 for ( i = 0; i <= lenNr; i += 2 ) 332 { 333 const int k = i + E; 334 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; 335 for (j = 0; j <= h; j += 4 ) 336 { 337 sum[0] += Hrev[j ] * ( X[i+j ] + X[k-j+2] ); 338 sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+3] ); 339 sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j ] ); 340 sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j+1] ); 341 } 342 Y[i ] = sum[0] + sum[2]; 343 Y[i+1] = sum[1] + sum[3]; 344 } 345 return i/2; 346 } 347 else 348 { 349 for ( i = 0; i <= lenNr; ++i ) 350 { 351 const int k = i + E; 352 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; 353 for (j = 0; j <= h; j += 4 ) 354 { 355 sum[0] += Hrev[j ] * ( X[i+j ] + X[k-j+3] ); 356 sum[1] += Hrev[j+1] * ( X[i+j+1] + X[k-j+2] ); 357 sum[2] += Hrev[j+2] * ( X[i+j+2] + X[k-j+1] ); 358 sum[3] += Hrev[j+3] * ( X[i+j+3] + X[k-j ] ); 359 } 360 Y[i] = sum[0] + sum[1] + sum[2] + sum[3]; 361 } 362 return i; 363 } 364 } 365 else 366 { 367 const float * RESTRICT X = input; 368 const float * RESTRICT Hrev = p->H; 369 float * RESTRICT Y = output; 370 const int Nr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * p->N; 371 const int lenNr = ((p->flags & PFFASTCONV_CPLX_INP_OUT) ? 2 : 1) * (len - p->N); 372 int i, j; 373 374 if (p->flags & PFFASTCONV_CPLX_INP_OUT) 375 { 376 for ( i = 0; i <= lenNr; i += 2 ) 377 { 378 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; 379 for (j = 0; j < Nr; j += 4 ) 380 { 381 sum[0] += X[i+j] * Hrev[j]; 382 sum[1] += X[i+j+1] * Hrev[j+1]; 383 sum[2] += X[i+j+2] * Hrev[j+2]; 384 sum[3] += X[i+j+3] * Hrev[j+3]; 385 } 386 Y[i ] = sum[0] + sum[2]; 387 Y[i+1] = sum[1] + sum[3]; 388 } 389 return i/2; 390 } 391 else 392 { 393 if ( (Nr & 3) == 0 ) 394 { 395 for ( i = 0; i <= lenNr; ++i ) 396 { 397 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; 398 for (j = 0; j < Nr; j += 4 ) 399 { 400 sum[0] += X[i+j] * Hrev[j]; 401 sum[1] += X[i+j+1] * Hrev[j+1]; 402 sum[2] += X[i+j+2] * Hrev[j+2]; 403 sum[3] += X[i+j+3] * Hrev[j+3]; 404 } 405 Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]); 406 } 407 return i; 408 } 409 else 410 { 411 const int M = Nr & (~3); 412 /* printf("B: Nr = %d\n", Nr ); */ 413 for ( i = 0; i <= lenNr; ++i ) 414 { 415 float tailSum = 0.0; 416 sum[0] = sum[1] = sum[2] = sum[3] = 0.0F; 417 for (j = 0; j < M; j += 4 ) 418 { 419 sum[0] += X[i+j] * Hrev[j]; 420 sum[1] += X[i+j+1] * Hrev[j+1]; 421 sum[2] += X[i+j+2] * Hrev[j+2]; 422 sum[3] += X[i+j+3] * Hrev[j+3]; 423 } 424 for ( ; j < Nr; ++j ) 425 tailSum += X[i+j] * Hrev[j]; 426 Y[i] = (sum[0] + sum[1]) + (sum[2] + sum[3]) + tailSum; 427 } 428 return i; 429 } 430 } 431 } 432 433 } 434 435 436 int fast_conv(void * setup, const float * X, int len, float *Y, const float *Yref, int applyFlush) 437 { 438 (void)Yref; 439 return pffastconv_apply( (PFFASTCONV_Setup*)setup, X, len, Y, applyFlush ); 440 } 441 442 443 444 void printFirst( const float * V, const char * st, const int N, const int perLine ) 445 { 446 (void)V; (void)st; (void)N; (void)perLine; 447 return; 448 #if 0 449 int i; 450 for ( i = 0; i < N; ++i ) 451 { 452 if ( (i % perLine) == 0 ) 453 printf("\n%s[%d]", st, i); 454 printf("\t%.1f", V[i]); 455 } 456 printf("\n"); 457 #endif 458 } 459 460 461 462 #define NUMY 11 463 464 465 int test(int FILTERLEN, int convFlags, const int testOutLen, int printDbg, int printSpeed) { 466 double t0, t1, tstop, td, tdref; 467 float *X, *H; 468 float *Y[NUMY]; 469 int64_t outN[NUMY]; 470 /* 256 KFloats or 16 MFloats data */ 471 #if 1 472 const int len = testOutLen ? (1 << 18) : (1 << 24); 473 #elif 0 474 const int len = testOutLen ? (1 << 18) : (1 << 13); 475 #else 476 const int len = testOutLen ? (1 << 18) : (1024); 477 #endif 478 const int cplxFactor = ( convFlags & PFFASTCONV_CPLX_INP_OUT ) ? 2 : 1; 479 const int lenC = len / cplxFactor; 480 481 int yi, yc, posMaxErr; 482 float yRangeMin, yRangeMax, yErrLimit, maxErr = 0.0; 483 int i, j, numErrOverLimit, iter; 484 int retErr = 0; 485 486 /* 0 1 2 3 4 5 6 7 8 9 */ 487 pfnConvSetup aSetup[NUMY] = { convSetupRev, convSetupRev, convSetupRev, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup, fastConvSetup }; 488 pfnConvDestroy aDestroy[NUMY] = { convDestroyRev, convDestroyRev, convDestroyRev, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy, fastConvDestroy }; 489 pfnGetConvFnPtr aGetFnPtr[NUMY] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; 490 pfnConvolution aConv[NUMY] = { slow_conv_R, slow_conv_A, slow_conv_B, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv, fast_conv }; 491 const char * convText[NUMY] = { "R(non-simd)", "A(non-simd)", "B(non-simd)", "fast_conv_64", "fast_conv_128", "fast_conv_256", "fast_conv_512", "fast_conv_1K", "fast_conv_2K", "fast_conv_4K" }; 492 int aFastAlgo[NUMY] = { 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }; 493 void * aSetupCfg[NUMY] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }; 494 int aBlkLen[NUMY] = { 1024, 1024, 1024, 64, 128, 256, 512, 1024, 2048, 4096 }; 495 #if 1 496 int aRunAlgo[NUMY] = { 1, 1, 1, FILTERLEN<64, FILTERLEN<128, FILTERLEN<256, FILTERLEN<512, FILTERLEN<1024, FILTERLEN<2048, FILTERLEN<4096 }; 497 #elif 0 498 int aRunAlgo[NUMY] = { 1, 0, 0, 0 && FILTERLEN<64, 1 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048, 0 && FILTERLEN<4096 }; 499 #else 500 int aRunAlgo[NUMY] = { 1, 1, 1, 0 && FILTERLEN<64, 0 && FILTERLEN<128, 1 && FILTERLEN<256, 0 && FILTERLEN<512, 0 && FILTERLEN<1024, 0 && FILTERLEN<2048, 0 && FILTERLEN<4096 }; 501 #endif 502 double aSpeedFactor[NUMY], aDuration[NUMY], procSmpPerSec[NUMY]; 503 504 X = pffastconv_malloc( (unsigned)(len+4) * sizeof(float) ); 505 for ( i=0; i < NUMY; ++i) 506 { 507 if ( 1 || i < 2 ) 508 Y[i] = pffastconv_malloc( (unsigned)len * sizeof(float) ); 509 else 510 Y[i] = Y[1]; 511 512 Y[i][0] = 123.F; /* test for pffft_zconvolve_no_accu() */ 513 aSpeedFactor[i] = -1.0; 514 aDuration[i] = -1.0; 515 procSmpPerSec[i] = -1.0; 516 } 517 518 H = pffastconv_malloc((unsigned)FILTERLEN * sizeof(float)); 519 520 /* initialize input */ 521 if ( convFlags & PFFASTCONV_CPLX_INP_OUT ) 522 { 523 for ( i = 0; i < lenC; ++i ) 524 { 525 X[2*i ] = (float)(i % 4093); /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */ 526 X[2*i+1] = (float)((i+2048) % 4093); 527 } 528 } 529 else 530 { 531 for ( i = 0; i < len; ++i ) 532 X[i] = (float)(i % 4093); /* 4094 is a prime number. see https://en.wikipedia.org/wiki/List_of_prime_numbers */ 533 } 534 X[ len ] = INVALID_FLOAT_VAL; 535 X[ len +1 ] = INVALID_FLOAT_VAL; 536 X[ len +2 ] = INVALID_FLOAT_VAL; 537 X[ len +3 ] = INVALID_FLOAT_VAL; 538 539 if (!testOutLen) 540 printFirst( X, "X", 64, 8 ); 541 542 /* filter coeffs */ 543 memset( H, 0, FILTERLEN * sizeof(float) ); 544 #if 1 545 if ( convFlags & PFFASTCONV_SYMMETRIC ) 546 { 547 const int half = FILTERLEN / 2; 548 for ( j = 0; j < half; ++j ) { 549 switch (j % 3) { 550 case 0: H[j] = H[FILTERLEN-1-j] = -1.0F; break; 551 case 1: H[j] = H[FILTERLEN-1-j] = 1.0F; break; 552 case 2: H[j] = H[FILTERLEN-1-j] = 0.5F; break; 553 } 554 } 555 } 556 else 557 { 558 for ( j = 0; j < FILTERLEN; ++j ) { 559 switch (j % 3) { 560 case 0: H[j] = -1.0F; break; 561 case 1: H[j] = 1.0F; break; 562 case 2: H[j] = 0.5F; break; 563 } 564 } 565 } 566 #else 567 H[0] = 1.0F; 568 H[FILTERLEN -1] = 1.0F; 569 #endif 570 if (!testOutLen) 571 printFirst( H, "H", FILTERLEN, 8 ); 572 573 printf("\n"); 574 printf("filterLen = %d\t%s%s\t%s:\n", FILTERLEN, 575 ((convFlags & PFFASTCONV_CPLX_INP_OUT)?"cplx":"real"), 576 (convFlags & PFFASTCONV_CPLX_INP_OUT)?((convFlags & PFFASTCONV_CPLX_SINGLE_FFT)?" single":" 2x") : "", 577 ((convFlags & PFFASTCONV_SYMMETRIC)?"symmetric":"non-sym") ); 578 579 while (1) 580 { 581 582 for ( yi = 0; yi < NUMY; ++yi ) 583 { 584 if (!aRunAlgo[yi]) 585 continue; 586 587 aSetupCfg[yi] = aSetup[yi]( H, FILTERLEN, &aBlkLen[yi], convFlags ); 588 589 /* get effective apply function ptr */ 590 if ( aSetupCfg[yi] && aGetFnPtr[yi] ) 591 aConv[yi] = aGetFnPtr[yi]( aSetupCfg[yi] ); 592 593 if ( aSetupCfg[yi] && aConv[yi] ) { 594 if (testOutLen) 595 { 596 t0 = uclock_sec(); 597 outN[yi] = aConv[yi]( aSetupCfg[yi], X, lenC, Y[yi], Y[0], 1 /* applyFlush */ ); 598 t1 = uclock_sec(); 599 td = t1 - t0; 600 } 601 else 602 { 603 const int blkLen = 4096; /* required for 'fast_conv_4K' */ 604 int64_t offC = 0, offS, Nout; 605 int k; 606 iter = 0; 607 outN[yi] = 0; 608 t0 = uclock_sec(); 609 tstop = t0 + 0.25; /* benchmark duration: 250 ms */ 610 do { 611 for ( k = 0; k < 128 && offC +blkLen < lenC; ++k ) 612 { 613 offS = cplxFactor * offC; 614 Nout = aConv[yi]( aSetupCfg[yi], X +offS, blkLen, Y[yi] +offS, Y[0], (offC +blkLen >= lenC) /* applyFlush */ ); 615 offC += Nout; 616 ++iter; 617 if ( !Nout ) 618 break; 619 if ( offC +blkLen >= lenC ) 620 { 621 outN[yi] += offC; 622 offC = 0; 623 } 624 } 625 t1 = uclock_sec(); 626 } while ( t1 < tstop ); 627 outN[yi] += offC; 628 td = t1 - t0; 629 procSmpPerSec[yi] = cplxFactor * (double)outN[yi] / td; 630 } 631 } 632 else 633 { 634 t0 = t1 = td = 0.0; 635 outN[yi] = 0; 636 } 637 aDuration[yi] = td; 638 if ( yi == 0 ) { 639 const float * Yvals = Y[0]; 640 const int64_t refOutLen = cplxFactor * outN[0]; 641 tdref = td; 642 if (printDbg) { 643 printf("convolution '%s' took: %f ms\n", convText[yi], td*1000.0); 644 printf(" convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor); 645 } 646 aSpeedFactor[yi] = 1.0; 647 /* */ 648 yRangeMin = FLT_MAX; 649 yRangeMax = FLT_MIN; 650 for ( i = 0; i < refOutLen; ++i ) 651 { 652 if ( yRangeMax < Yvals[i] ) yRangeMax = Yvals[i]; 653 if ( yRangeMin > Yvals[i] ) yRangeMin = Yvals[i]; 654 } 655 yErrLimit = fabsf(yRangeMax - yRangeMin) / ( 100.0F * 1000.0F ); 656 /* yErrLimit = 0.01F; */ 657 if (testOutLen) { 658 if (1) { 659 printf("reference output len = %" PRId64 " smp\n", outN[0]); 660 printf("reference output range |%.1f ..%.1f| = %.1f ==> err limit = %f\n", yRangeMin, yRangeMax, yRangeMax - yRangeMin, yErrLimit); 661 } 662 printFirst( Yvals, "Yref", 64, 8 ); 663 } 664 } 665 else 666 { 667 aSpeedFactor[yi] = tdref / td; 668 if (printDbg) { 669 printf("\nconvolution '%s' took: %f ms == %f %% == %f X\n", convText[yi], td*1000.0, td * 100 / tdref, tdref / td); 670 printf(" convolution '%s' output size %" PRId64 " == (cplx) len %d + %" PRId64 "\n", convText[yi], outN[yi], len / cplxFactor, outN[yi] - len / cplxFactor); 671 } 672 } 673 } 674 675 int iMaxSpeedSlowAlgo = -1; 676 int iFirstFastAlgo = -1; 677 int iMaxSpeedFastAlgo = -1; 678 int iPrintedRefOutLen = 0; 679 { 680 for ( yc = 1; yc < NUMY; ++yc ) 681 { 682 if (!aRunAlgo[yc]) 683 continue; 684 if (aFastAlgo[yc]) { 685 if ( iMaxSpeedFastAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedFastAlgo] ) 686 iMaxSpeedFastAlgo = yc; 687 688 if (iFirstFastAlgo < 0) 689 iFirstFastAlgo = yc; 690 } 691 else 692 { 693 if ( iMaxSpeedSlowAlgo < 0 || aSpeedFactor[yc] > aSpeedFactor[iMaxSpeedSlowAlgo] ) 694 iMaxSpeedSlowAlgo = yc; 695 } 696 } 697 698 if (printSpeed) 699 { 700 if (testOutLen) 701 { 702 if (iMaxSpeedSlowAlgo >= 0 ) 703 printf("fastest slow algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedSlowAlgo], aSpeedFactor[iMaxSpeedSlowAlgo], 1000.0 * aDuration[iMaxSpeedSlowAlgo]); 704 if (0 != iMaxSpeedSlowAlgo && aRunAlgo[0]) 705 printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[0], aSpeedFactor[0], 1000.0 * aDuration[0]); 706 if (1 != iMaxSpeedSlowAlgo && aRunAlgo[1]) 707 printf("slow algorithm '%s' at speed %f X ; abs duration %f ms\n", convText[1], aSpeedFactor[1], 1000.0 * aDuration[1]); 708 709 if (iFirstFastAlgo >= 0 && iFirstFastAlgo != iMaxSpeedFastAlgo && aRunAlgo[iFirstFastAlgo]) 710 printf("first fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo], aSpeedFactor[iFirstFastAlgo], 1000.0 * aDuration[iFirstFastAlgo]); 711 if (iFirstFastAlgo >= 0 && iFirstFastAlgo+1 != iMaxSpeedFastAlgo && iFirstFastAlgo+1 < NUMY && aRunAlgo[iFirstFastAlgo+1]) 712 printf("2nd fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iFirstFastAlgo+1], aSpeedFactor[iFirstFastAlgo+1], 1000.0 * aDuration[iFirstFastAlgo+1]); 713 714 if ( 0 <= iMaxSpeedFastAlgo && iMaxSpeedFastAlgo < NUMY && aRunAlgo[iMaxSpeedFastAlgo] ) 715 { 716 printf("fastest fast algorithm is '%s' at speed %f X ; abs duration %f ms\n", convText[iMaxSpeedFastAlgo], aSpeedFactor[iMaxSpeedFastAlgo], 1000.0 * aDuration[iMaxSpeedFastAlgo]); 717 if ( 0 <= iMaxSpeedSlowAlgo && iMaxSpeedSlowAlgo < NUMY && aRunAlgo[iMaxSpeedSlowAlgo] ) 718 printf("fast / slow ratio: %f X\n", aSpeedFactor[iMaxSpeedFastAlgo] / aSpeedFactor[iMaxSpeedSlowAlgo] ); 719 } 720 printf("\n"); 721 } 722 else 723 { 724 for ( yc = 0; yc < NUMY; ++yc ) 725 { 726 if (!aRunAlgo[yc] || procSmpPerSec[yc] <= 0.0) 727 continue; 728 printf("algo '%s':\t%.2f MSmp\tin\t%.1f ms\t= %g kSmpPerSec\n", 729 convText[yc], (double)outN[yc]/(1000.0 * 1000.0), 1000.0 * aDuration[yc], procSmpPerSec[yc] * 0.001 ); 730 } 731 } 732 733 } 734 } 735 736 737 for ( yc = 1; yc < NUMY; ++yc ) 738 { 739 const float * Yref; 740 const float * Ycurr; 741 int outMin; 742 743 if (!aRunAlgo[yc]) 744 continue; 745 746 if (printDbg) 747 printf("\n"); 748 749 if ( outN[yc] == 0 ) 750 { 751 printf("output size 0: '%s' not implemented\n", convText[yc]); 752 } 753 else if ( outN[0] != outN[yc] /* && aFastAlgo[yc] */ && testOutLen ) 754 { 755 if (!iPrintedRefOutLen) 756 { 757 printf("reference output size = %" PRId64 ", delta to (cplx) input length = %" PRId64 " smp\n", outN[0], (len / cplxFactor) - outN[0]); 758 iPrintedRefOutLen = 1; 759 } 760 printf("output size doesn't match!: ref (FILTERLEN %d) returned %" PRId64 " smp, '%s' returned %" PRId64 " smp : delta = %" PRId64 " smp\n", 761 FILTERLEN, outN[0], convText[yc], outN[yc], outN[yc] - outN[0] ); 762 retErr = 1; 763 } 764 765 posMaxErr = 0; 766 maxErr = -1.0; 767 Yref = Y[0]; 768 Ycurr = Y[yc]; 769 outMin = ( outN[yc] < outN[0] ) ? outN[yc] : outN[0]; 770 numErrOverLimit = 0; 771 for ( i = 0; i < outMin; ++i ) 772 { 773 if ( numErrOverLimit < 6 && fabs(Ycurr[i] - Yref[i]) >= yErrLimit ) 774 { 775 printf("algo '%s': at %d: ***ERROR*** = %f, errLimit = %f, ref = %f, actual = %f\n", 776 convText[yc], i, fabs(Ycurr[i] - Yref[i]), yErrLimit, Yref[i], Ycurr[i] ); 777 ++numErrOverLimit; 778 } 779 780 if ( fabs(Ycurr[i] - Yref[i]) > maxErr ) 781 { 782 maxErr = fabsf(Ycurr[i] - Yref[i]); 783 posMaxErr = i; 784 } 785 } 786 787 if ( printDbg || (iMaxSpeedSlowAlgo == i) || (iMaxSpeedFastAlgo == i) ) 788 printf("max difference for '%s' is %g at sample idx %d of max inp 4093-1 == %f %%\n", convText[yc], maxErr, posMaxErr, maxErr * 100.0 / 4092.0 ); 789 } 790 791 break; 792 } 793 794 pffastconv_free(X); 795 for ( i=0; i < NUMY; ++i) 796 { 797 if ( 1 || i < 2 ) 798 pffastconv_free( Y[i] ); 799 if (!aRunAlgo[i]) 800 continue; 801 aDestroy[i]( aSetupCfg[i] ); 802 } 803 804 pffastconv_free(H); 805 806 return retErr; 807 } 808 809 /* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */ 810 void validate_pffft_simd(); 811 int validate_pffft_simd_ex(FILE * DbgOut); 812 813 814 int main(int argc, char **argv) 815 { 816 int result = 0; 817 int i, k, M, flagsA, flagsB, flagsC, testOutLen, printDbg, printSpeed; 818 int testOutLens = 1, benchConv = 1, quickTest = 0, slowTest = 0; 819 int testReal = 1, testCplx = 1, testSymetric = 0; 820 821 for ( i = 1; i < argc; ++i ) { 822 823 if (!strcmp(argv[i], "--test-simd")) { 824 int numErrs = validate_pffft_simd_ex(stdout); 825 fprintf( ( numErrs != 0 ? stderr : stdout ), "validate_pffft_simd_ex() returned %d errors!\n", numErrs); 826 return ( numErrs > 0 ? 1 : 0 ); 827 } 828 829 if (!strcmp(argv[i], "--no-len")) { 830 testOutLens = 0; 831 } 832 else if (!strcmp(argv[i], "--no-bench")) { 833 benchConv = 0; 834 } 835 else if (!strcmp(argv[i], "--quick")) { 836 quickTest = 1; 837 } 838 else if (!strcmp(argv[i], "--slow")) { 839 slowTest = 1; 840 } 841 else if (!strcmp(argv[i], "--real")) { 842 testCplx = 0; 843 } 844 else if (!strcmp(argv[i], "--cplx")) { 845 testReal = 0; 846 } 847 else if (!strcmp(argv[i], "--sym")) { 848 testSymetric = 1; 849 } 850 else /* if (!strcmp(argv[i], "--help")) */ { 851 printf("usage: %s [--test-simd] [--no-len] [--no-bench] [--quick|--slow] [--real|--cplx] [--sym]\n", argv[0]); 852 exit(1); 853 } 854 } 855 856 857 if (testOutLens) 858 { 859 for ( k = 0; k < 3; ++k ) 860 { 861 if ( (k == 0 && !testReal) || (k > 0 && !testCplx) ) 862 continue; 863 printf("\n\n==========\n"); 864 printf("testing %s %s output lengths ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) ); 865 printf("==========\n"); 866 flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT; 867 flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 ); 868 flagsC = flagsB | PFFASTCONV_CPLX_SINGLE_FFT; 869 testOutLen = 1; 870 printDbg = 0; 871 printSpeed = 0; 872 for ( M = 128 - 4; M <= (quickTest ? 128+16 : 256); ++M ) 873 { 874 if ( (M % 16) != 0 && testSymetric ) 875 continue; 876 result |= test(M, flagsB, testOutLen, printDbg, printSpeed); 877 } 878 } 879 } 880 881 if (benchConv) 882 { 883 for ( k = 0; k < 3; ++k ) 884 { 885 if ( (k == 0 && !testReal) || (k > 0 && !testCplx) ) 886 continue; 887 printf("\n\n==========\n"); 888 printf("starting %s %s benchmark against linear convolutions ..\n", (k == 0 ? "real" : "cplx"), ( k == 0 ? "" : (k==1 ? "2x" : "single") ) ); 889 printf("==========\n"); 890 flagsA = (k == 0) ? 0 : PFFASTCONV_CPLX_INP_OUT; 891 flagsB = flagsA | ( testSymetric ? PFFASTCONV_SYMMETRIC : 0 ); 892 flagsC = flagsB | ( k == 2 ? PFFASTCONV_CPLX_SINGLE_FFT : 0 ); 893 testOutLen = 0; 894 printDbg = 0; 895 printSpeed = 1; 896 if (!slowTest) { 897 result |= test( 32, flagsC, testOutLen, printDbg, printSpeed); 898 result |= test( 32+ 16, flagsC, testOutLen, printDbg, printSpeed); 899 result |= test( 64, flagsC, testOutLen, printDbg, printSpeed); 900 result |= test( 64+ 32, flagsC, testOutLen, printDbg, printSpeed); 901 result |= test(128, flagsC, testOutLen, printDbg, printSpeed); 902 } 903 if (!quickTest) { 904 result |= test(128+ 64, flagsC, testOutLen, printDbg, printSpeed); 905 result |= test(256, flagsC, testOutLen, printDbg, printSpeed); 906 result |= test(256+128, flagsC, testOutLen, printDbg, printSpeed); 907 result |= test(512, flagsC, testOutLen, printDbg, printSpeed); 908 result |= test(1024, flagsC, testOutLen, printDbg, printSpeed); 909 } 910 } 911 } 912 913 return result; 914 } 915 916