xref: /aosp_15_r20/external/OpenCL-CTS/test_conformance/contractions/contractions.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 
18 #include <string.h>
19 #include <stdio.h>
20 
21 #if !defined(_WIN32)
22 #include <libgen.h>
23 #include <sys/param.h>
24 #endif
25 
26 #include "mingw_compat.h"
27 #if defined (__MINGW32__)
28 #include <sys/param.h>
29 #endif
30 
31 #include <time.h>
32 #include "errorHelpers.h"
33 #include "harness/compat.h"
34 #include "harness/mt19937.h"
35 #include "harness/kernelHelpers.h"
36 #include "harness/rounding_mode.h"
37 #include "harness/fpcontrol.h"
38 #include "harness/testHarness.h"
39 #include "harness/parseParameters.h"
40 #if defined( __APPLE__ )
41 #include <sys/sysctl.h>
42 #endif
43 #if defined( __linux__ )
44 #include <unistd.h>
45 #include <sys/syscall.h>
46 #include <linux/sysctl.h>
47 #endif
48 
49 #if defined (_WIN32)
50 #include <string.h>
51 #endif
52 
53 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
54 #include <emmintrin.h>
55 #endif
56 
57 #if defined(__PPC__)
58 // Global varaiable used to hold the FPU control register state. The FPSCR register can not
59 // be used because not all Power implementations retain or observed the NI (non-IEEE
60 // mode) bit.
61 __thread fpu_control_t fpu_control = 0;
62 #endif
63 
64 #ifndef MAXPATHLEN
65 #define MAXPATHLEN  2048
66 #endif
67 
68 char                appName[ MAXPATHLEN ] = "";
69 cl_context          gContext = NULL;
70 cl_command_queue    gQueue = NULL;
71 cl_program          gProgram[5] = { NULL, NULL, NULL, NULL, NULL };
72 cl_program          gProgram_double[5] = { NULL, NULL, NULL, NULL, NULL };
73 int                 gForceFTZ = 0;
74 int                 gSeed = 0;
75 int                 gSeedSpecified = 0;
76 int                 gHasDouble = 0;
77 MTdata              gMTdata = NULL;
78 int                 gSkipNanInf = 0;
79 int                     gIgnoreZeroSign = 0;
80 
81 cl_mem              bufA = NULL;
82 cl_mem              bufB = NULL;
83 cl_mem              bufC = NULL;
84 cl_mem              bufD = NULL;
85 cl_mem              bufE = NULL;
86 cl_mem              bufC_double = NULL;
87 cl_mem              bufD_double = NULL;
88 float               *buf1, *buf2, *buf3, *buf4, *buf5, *buf6;
89 float               *correct[8];
90 int                     *skipTest[8];
91 
92 double              *buf3_double, *buf4_double, *buf5_double, *buf6_double;
93 double              *correct_double[8];
94 
95 static const char   **gArgList;
96 static size_t       gArgCount;
97 
98 #define BUFFER_SIZE         (1024*1024)
99 
100 
101 static int ParseArgs( int argc, const char **argv );
102 static void PrintUsage( void );
103 test_status InitCL( cl_device_id device );
104 static void ReleaseCL( void );
105 static int RunTest( int testNumber );
106 static int RunTest_Double( int testNumber );
107 
108 #if defined(__ANDROID__)
109 #define nanf( X ) strtof( "NAN", ( char ** ) NULL )
110 #define nan( X )  strtod( "NAN", ( char ** ) NULL )
111 #endif
112 
113 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
114 // defeat x87 on MSVC
sse_add(float x,float y)115 float sse_add(float x, float y)
116 {
117     volatile float a = x;
118     volatile float b = y;
119 
120     // defeat x87
121     __m128 va = _mm_set_ss( (float) a );
122     __m128 vb = _mm_set_ss( (float) b );
123     va = _mm_add_ss( va, vb );
124     _mm_store_ss( (float*) &a, va );
125     return a;
126 }
127 
sse_add_sd(double x,double y)128 double sse_add_sd(double x, double y)
129 {
130     volatile double a = x;
131     volatile double b = y;
132 
133     // defeat x87
134     __m128d va = _mm_set_sd( (double) a );
135     __m128d vb = _mm_set_sd( (double) b );
136     va = _mm_add_sd( va, vb );
137     _mm_store_sd( (double*) &a, va );
138     return a;
139 }
140 
sse_sub(float x,float y)141 float sse_sub(float x, float y)
142 {
143     volatile float a = x;
144     volatile float b = y;
145 
146     // defeat x87
147     __m128 va = _mm_set_ss( (float) a );
148     __m128 vb = _mm_set_ss( (float) b );
149     va = _mm_sub_ss( va, vb );
150     _mm_store_ss( (float*) &a, va );
151     return a;
152 }
153 
sse_sub_sd(double x,double y)154 double sse_sub_sd(double x, double y)
155 {
156     volatile double a = x;
157     volatile double b = y;
158 
159     // defeat x87
160     __m128d va = _mm_set_sd( (double) a );
161     __m128d vb = _mm_set_sd( (double) b );
162     va = _mm_sub_sd( va, vb );
163     _mm_store_sd( (double*) &a, va );
164     return a;
165 }
166 
sse_mul(float x,float y)167 float sse_mul(float x, float y)
168 {
169     volatile float a = x;
170     volatile float b = y;
171 
172     // defeat x87
173     __m128 va = _mm_set_ss( (float) a );
174     __m128 vb = _mm_set_ss( (float) b );
175     va = _mm_mul_ss( va, vb );
176     _mm_store_ss( (float*) &a, va );
177     return a;
178 }
179 
sse_mul_sd(double x,double y)180 double sse_mul_sd(double x, double y)
181 {
182     volatile double a = x;
183     volatile double b = y;
184 
185     // defeat x87
186     __m128d va = _mm_set_sd( (double) a );
187     __m128d vb = _mm_set_sd( (double) b );
188     va = _mm_mul_sd( va, vb );
189     _mm_store_sd( (double*) &a, va );
190     return a;
191 }
192 #endif
193 
194 #ifdef __PPC__
ppc_mul(float a,float b)195 float ppc_mul(float a, float b)
196 {
197     float p;
198 
199     if (gForceFTZ) {
200         // Flush input a to zero if it is sub-normal
201         if (fabsf(a) < FLT_MIN) {
202             a = copysignf(0.0, a);
203         }
204         // Flush input b to zero if it is sub-normal
205         if (fabsf(b) < FLT_MIN) {
206             b = copysignf(0.0, b);
207         }
208         // Perform multiply
209         p = a * b;
210         // Flush the product if it is a sub-normal
211         if (fabs((double)a * (double)b) < FLT_MIN) {
212             p = copysignf(0.0, p);
213         }
214     } else {
215         p = a * b;
216     }
217     return p;
218 }
219 #endif
220 
test_contractions_float_0(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)221 int test_contractions_float_0(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
222 {
223     return RunTest(0);
224 }
225 
test_contractions_float_1(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)226 int test_contractions_float_1(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
227 {
228     return RunTest(1);
229 }
230 
test_contractions_float_2(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)231 int test_contractions_float_2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
232 {
233     return RunTest(2);
234 }
235 
test_contractions_float_3(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)236 int test_contractions_float_3(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
237 {
238     return RunTest(3);
239 }
240 
test_contractions_float_4(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)241 int test_contractions_float_4(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
242 {
243     return RunTest(4);
244 }
245 
test_contractions_float_5(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)246 int test_contractions_float_5(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
247 {
248     return RunTest(5);
249 }
250 
test_contractions_float_6(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)251 int test_contractions_float_6(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
252 {
253     return RunTest(6);
254 }
255 
test_contractions_float_7(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)256 int test_contractions_float_7(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
257 {
258     return RunTest(7);
259 }
260 
test_contractions_double_0(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)261 int test_contractions_double_0(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
262 {
263     return RunTest_Double(0);
264 }
265 
test_contractions_double_1(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)266 int test_contractions_double_1(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
267 {
268     return RunTest_Double(1);
269 }
270 
test_contractions_double_2(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)271 int test_contractions_double_2(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
272 {
273     return RunTest_Double(2);
274 }
275 
test_contractions_double_3(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)276 int test_contractions_double_3(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
277 {
278     return RunTest_Double(3);
279 }
280 
test_contractions_double_4(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)281 int test_contractions_double_4(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
282 {
283     return RunTest_Double(4);
284 }
285 
test_contractions_double_5(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)286 int test_contractions_double_5(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
287 {
288     return RunTest_Double(5);
289 }
290 
test_contractions_double_6(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)291 int test_contractions_double_6(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
292 {
293     return RunTest_Double(6);
294 }
295 
test_contractions_double_7(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)296 int test_contractions_double_7(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
297 {
298     return RunTest_Double(7);
299 }
300 
301 test_definition test_list[] = {
302     ADD_TEST( contractions_float_0 ),
303     ADD_TEST( contractions_float_1 ),
304     ADD_TEST( contractions_float_2 ),
305     ADD_TEST( contractions_float_3 ),
306     ADD_TEST( contractions_float_4 ),
307     ADD_TEST( contractions_float_5 ),
308     ADD_TEST( contractions_float_6 ),
309     ADD_TEST( contractions_float_7 ),
310     ADD_TEST( contractions_double_0 ),
311     ADD_TEST( contractions_double_1 ),
312     ADD_TEST( contractions_double_2 ),
313     ADD_TEST( contractions_double_3 ),
314     ADD_TEST( contractions_double_4 ),
315     ADD_TEST( contractions_double_5 ),
316     ADD_TEST( contractions_double_6 ),
317     ADD_TEST( contractions_double_7 ),
318 };
319 
320 const int test_num = ARRAY_SIZE( test_list );
321 
main(int argc,const char ** argv)322 int main( int argc, const char **argv )
323 {
324     argc = parseCustomParam(argc, argv);
325     if (argc == -1)
326     {
327         return -1;
328     }
329 
330     int error = ParseArgs( argc, argv );
331 
332     if( !error )
333     {
334         error = runTestHarnessWithCheck( gArgCount, gArgList, test_num, test_list, true, 0, InitCL );
335     }
336 
337     if( gQueue )
338     {
339         int flush_error = clFinish( gQueue );
340         if( flush_error )
341             log_error( "clFinish failed: %d\n", flush_error );
342     }
343 
344     ReleaseCL();
345     free( gArgList );
346 
347     return error;
348 }
349 
350 
351 
ParseArgs(int argc,const char ** argv)352 static int ParseArgs( int argc, const char **argv )
353 {
354     gArgList = (const char **)calloc( argc, sizeof( char*) );
355 
356     if( NULL == gArgList )
357     {
358         vlog_error( "Failed to allocate memory for argList\n" );
359         return 1;
360     }
361 
362     gArgList[0] = argv[0];
363     gArgCount = 1;
364 
365     int length_of_seed = 0;
366 
367     { // Extract the app name
368         strncpy( appName, argv[0], MAXPATHLEN );
369 
370 #if (defined( __APPLE__ ) || defined(__linux__) || defined(__MINGW32__))
371         char baseName[MAXPATHLEN];
372         char *base = NULL;
373         strncpy( baseName, argv[0], MAXPATHLEN );
374         base = basename( baseName );
375         if( NULL != base )
376         {
377             strncpy( appName, base, sizeof( appName )  );
378             appName[ sizeof( appName ) -1 ] = '\0';
379         }
380 #elif defined (_WIN32)
381         char fname[_MAX_FNAME + _MAX_EXT + 1];
382         char ext[_MAX_EXT];
383 
384         errno_t err = _splitpath_s( argv[0], NULL, 0, NULL, 0,
385                                    fname, _MAX_FNAME, ext, _MAX_EXT );
386         if (err == 0) { // no error
387             strcat (fname, ext); //just cat them, size of frame can keep both
388             strncpy (appName, fname, sizeof(appName));
389             appName[ sizeof( appName ) -1 ] = '\0';
390         }
391 #endif
392     }
393 
394     for( int i = 1; i < argc; i++ )
395     {
396         const char *arg = argv[i];
397         if( NULL == arg )
398             break;
399 
400         if( arg[0] == '-' )
401         {
402             while( arg[1] != '\0' )
403             {
404                 arg++;
405                 switch( *arg )
406                 {
407                     case 'h':
408                         PrintUsage();
409                         return -1;
410 
411                     case 's':
412                         arg++;
413                         gSeed = atoi( arg );
414                         while (arg[length_of_seed] >='0' && arg[length_of_seed]<='9')
415                             length_of_seed++;
416                         gSeedSpecified = 1;
417                         arg+=length_of_seed-1;
418                         break;
419 
420                     case 'z':
421                         gForceFTZ ^= 1;
422                         break;
423 
424                     default:
425                         vlog( " <-- unknown flag: %c (0x%2.2x)\n)", *arg, *arg );
426                         PrintUsage();
427                         return -1;
428                 }
429             }
430         }
431         else
432         {
433             gArgList[gArgCount] = arg;
434             gArgCount++;
435         }
436     }
437 
438     PrintArch();
439 
440     return 0;
441 }
442 
PrintUsage(void)443 static void PrintUsage( void )
444 {
445     vlog( "%s [-z]: <optional: test names>\n", appName );
446     vlog( "\tOptions:\n" );
447     vlog( "\t\t-z\tToggle FTZ mode (Section 6.5.3) for all functions. (Set by device capabilities by default.)\n" );
448     vlog( "\t\t-sNUMBER set random seed.\n");
449     vlog( "\n" );
450     vlog( "\tTest names:\n" );
451     for( int i = 0; i < test_num; i++ )
452     {
453         vlog( "\t\t%s\n", test_list[i].name );
454     }
455 }
456 
457 const char *sizeNames[] = { "float", "float2", "float4", "float8", "float16" };
458 const char *sizeNames_double[] = { "double", "double2", "double4", "double8", "double16" };
459 
InitCL(cl_device_id device)460 test_status InitCL( cl_device_id device )
461 {
462     int error;
463     uint32_t i, j;
464     int *bufSkip = NULL;
465     int isRTZ = 0;
466     RoundingMode oldRoundMode = kDefaultRoundingMode;
467 
468     cl_device_fp_config floatCapabilities = 0;
469     if( (error = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(floatCapabilities), &floatCapabilities, NULL)))
470         floatCapabilities = 0;
471     if(0 == (CL_FP_DENORM & floatCapabilities) )
472         gForceFTZ ^= 1;
473 
474     // check for cl_khr_fp64
475     gHasDouble = is_extension_available(device, "cl_khr_fp64" );
476 
477     if(0 == (CL_FP_INF_NAN & floatCapabilities) )
478         gSkipNanInf = 1;
479 
480     // Embedded devices that flush to zero are allowed to have an undefined sign.
481     if (gIsEmbedded && gForceFTZ)
482         gIgnoreZeroSign = 1;
483 
484     gContext = clCreateContext( NULL, 1, &device, notify_callback, NULL, &error );
485     if( NULL == gContext || error )
486     {
487         vlog_error( "clCreateDeviceGroup failed. %d\n", error );
488         return TEST_FAIL;
489     }
490 
491     gQueue = clCreateCommandQueue( gContext, device, 0, &error );
492     if( NULL == gQueue || error )
493     {
494         vlog_error( "clCreateContext failed. %d\n", error );
495         return TEST_FAIL;
496     }
497 
498     // setup input buffers
499     bufA = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
500     bufB = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
501     bufC = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
502     bufD = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
503     bufE = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
504 
505     if( bufA == NULL    ||
506        bufB == NULL    ||
507        bufC == NULL    ||
508        bufD == NULL    ||
509        bufE == NULL    )
510     {
511         vlog_error( "clCreateArray failed for input\n" );
512         return TEST_FAIL;
513     }
514 
515     if( gHasDouble )
516     {
517         bufC_double = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
518         bufD_double = clCreateBuffer(  gContext,  CL_MEM_READ_WRITE, BUFFER_SIZE, NULL, NULL );
519         if( bufC_double == NULL    ||
520            bufD_double == NULL    )
521         {
522             vlog_error( "clCreateArray failed for input DP\n" );
523             return TEST_FAIL;
524         }
525     }
526 
527     const char *kernels[] = {
528         "", "#pragma OPENCL FP_CONTRACT OFF\n"
529         "__kernel void kernel1( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
530         "{\n"
531         "   int i = get_global_id(0);\n"
532         "   out[i] = a[i] * b[i] + c[i];\n"
533         "}\n"
534         "\n"
535         "__kernel void kernel2( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
536         "{\n"
537         "   int i = get_global_id(0);\n"
538         "   out[i] = a[i] * b[i] - c[i];\n"
539         "}\n"
540         "\n"
541         "__kernel void kernel3( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
542         "{\n"
543         "   int i = get_global_id(0);\n"
544         "   out[i] = c[i] + a[i] * b[i];\n"
545         "}\n"
546         "\n"
547         "__kernel void kernel4( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
548         "{\n"
549         "   int i = get_global_id(0);\n"
550         "   out[i] = c[i] - a[i] * b[i];\n"
551         "}\n"
552         "\n"
553         "__kernel void kernel5( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
554         "{\n"
555         "   int i = get_global_id(0);\n"
556         "   out[i] = -(a[i] * b[i] + c[i]);\n"
557         "}\n"
558         "\n"
559         "__kernel void kernel6( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
560         "{\n"
561         "   int i = get_global_id(0);\n"
562         "   out[i] = -(a[i] * b[i] - c[i]);\n"
563         "}\n"
564         "\n"
565         "__kernel void kernel7( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
566         "{\n"
567         "   int i = get_global_id(0);\n"
568         "   out[i] = -(c[i] + a[i] * b[i]);\n"
569         "}\n"
570         "\n"
571         "__kernel void kernel8( __global ", NULL, " *out, const __global ", NULL, " *a, const __global ", NULL, " *b, const __global ", NULL, " *c )\n"
572         "{\n"
573         "   int i = get_global_id(0);\n"
574         "   out[i] = -(c[i] - a[i] * b[i]);\n"
575         "}\n"
576         "\n" };
577 
578     for (i = 0; i < sizeof(sizeNames) / sizeof(sizeNames[0]); i++)
579     {
580         size_t strCount = sizeof(kernels) / sizeof(kernels[0]);
581         kernels[0] = "";
582 
583         for (j = 2; j < strCount; j += 2) kernels[j] = sizeNames[i];
584         error = create_single_kernel_helper(gContext, &gProgram[i], nullptr,
585                                             strCount, kernels, nullptr);
586         if (CL_SUCCESS != error || nullptr == gProgram[i])
587         {
588             log_error("Error: Unable to create test program! (%s) (in %s:%d)\n",
589                       IGetErrorString(error), __FILE__, __LINE__);
590             return TEST_FAIL;
591         }
592     }
593 
594     if (gHasDouble)
595     {
596         kernels[0] = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
597         for (i = 0; i < sizeof(sizeNames_double) / sizeof(sizeNames_double[0]);
598              i++)
599         {
600             size_t strCount = sizeof(kernels) / sizeof(kernels[0]);
601 
602             for (j = 2; j < strCount; j += 2) kernels[j] = sizeNames_double[i];
603             error = create_single_kernel_helper(gContext, &gProgram_double[i],
604                                                 nullptr, strCount, kernels,
605                                                 nullptr);
606             if (CL_SUCCESS != error || nullptr == gProgram_double[i])
607             {
608                 log_error(
609                     "Error: Unable to create test program! (%s) (in %s:%d)\n",
610                     IGetErrorString(error), __FILE__, __LINE__);
611                 return TEST_FAIL;
612             }
613         }
614     }
615 
616     if( 0 == gSeedSpecified )
617     {
618         time_t currentTime = time( NULL );
619         struct tm *t = localtime(&currentTime);
620         gSeed = t->tm_sec + 60 * ( t->tm_min + 60 * (t->tm_hour + 24 * (t->tm_yday + 365 * t->tm_year)));
621         gSeed = (uint32_t) (((uint64_t) gSeed * (uint64_t) gSeed ) >> 16);
622     }
623     gMTdata = init_genrand( gSeed );
624 
625 
626     // Init bufA and bufB
627     {
628         buf1 = (float *)malloc( BUFFER_SIZE );
629         buf2 = (float *)malloc( BUFFER_SIZE );
630         buf3 = (float *)malloc( BUFFER_SIZE );
631         buf4 = (float *)malloc( BUFFER_SIZE );
632         buf5 = (float *)malloc( BUFFER_SIZE );
633         buf6 = (float *)malloc( BUFFER_SIZE );
634 
635         bufSkip = (int *)malloc( BUFFER_SIZE );
636 
637         if( NULL == buf1 || NULL == buf2 || NULL == buf3 || NULL == buf4 || NULL == buf5 || NULL == buf6 || NULL == bufSkip)
638         {
639             vlog_error( "Out of memory initializing buffers\n" );
640             return TEST_FAIL;
641         }
642         for( i = 0; i < sizeof( correct ) / sizeof( correct[0] ); i++ )
643         {
644             correct[i] = (float *)malloc( BUFFER_SIZE );
645             skipTest[i] = (int *)malloc( BUFFER_SIZE );
646             if(( NULL == correct[i] ) || ( NULL == skipTest[i]))
647             {
648                 vlog_error( "Out of memory initializing buffers 2\n" );
649                 return TEST_FAIL;
650             }
651         }
652 
653         for( i = 0; i < BUFFER_SIZE / sizeof(float); i++ )
654             ((uint32_t*) buf1)[i] = genrand_int32( gMTdata );
655 
656         if( (error = clEnqueueWriteBuffer(gQueue, bufA, CL_FALSE, 0, BUFFER_SIZE, buf1, 0, NULL, NULL) ))
657         {
658             vlog_error( "Failure %d at clEnqueueWriteBuffer1\n", error );
659             return TEST_FAIL;
660         }
661 
662         for( i = 0; i < BUFFER_SIZE / sizeof(float); i++ )
663             ((uint32_t*) buf2)[i] = genrand_int32( gMTdata );
664 
665         if( (error = clEnqueueWriteBuffer(gQueue, bufB, CL_FALSE, 0, BUFFER_SIZE, buf2, 0, NULL, NULL) ))
666         {
667             vlog_error( "Failure %d at clEnqueueWriteBuffer2\n", error );
668             return TEST_FAIL;
669         }
670 
671         void *ftzInfo = NULL;
672         if( gForceFTZ )
673             ftzInfo = FlushToZero();
674         if ((CL_FP_ROUND_TO_ZERO == get_default_rounding_mode(device)) && gIsEmbedded) {
675             oldRoundMode = set_round(kRoundTowardZero, kfloat);
676             isRTZ = 1;
677         }
678         float *f = (float*) buf1;
679         float *f2 = (float*) buf2;
680         float *f3 = (float*) buf3;
681         float *f4 = (float*) buf4;
682         for( i = 0; i < BUFFER_SIZE / sizeof(float); i++ )
683         {
684             float q = f[i];
685             float q2 = f2[i];
686 
687             feclearexcept(FE_OVERFLOW);
688 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
689             // VS2005 might use x87 for straight multiplies, and we can't
690             // turn that off
691             f3[i] = sse_mul(q, q2);
692             f4[i] = sse_mul(-q, q2);
693 #elif defined(__PPC__)
694             // None of the current generation PPC processors support HW
695             // FTZ, emulate it in sw.
696             f3[i] = ppc_mul(q, q2);
697             f4[i] = ppc_mul(-q, q2);
698 #else
699             f3[i] = q * q2;
700             f4[i] = -q * q2;
701 #endif
702             // Skip test if the device doesn't support infinities and NaN AND the result overflows
703             // or either input is an infinity of NaN
704             bufSkip[i] = (gSkipNanInf && ((FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW))) ||
705                                           (fabsf(q)  == FLT_MAX) || (q  != q)  ||
706                                           (fabsf(q2) == FLT_MAX) || (q2 != q2)));
707         }
708 
709         if( gForceFTZ )
710             UnFlushToZero(ftzInfo);
711 
712     if (isRTZ)
713       (void)set_round(oldRoundMode, kfloat);
714 
715 
716         if( (error = clEnqueueWriteBuffer(gQueue, bufC, CL_FALSE, 0, BUFFER_SIZE, buf3, 0, NULL, NULL) ))
717         {
718             vlog_error( "Failure %d at clEnqueueWriteBuffer3\n", error );
719             return TEST_FAIL;
720         }
721         if( (error = clEnqueueWriteBuffer(gQueue, bufD, CL_FALSE, 0, BUFFER_SIZE, buf4, 0, NULL, NULL) ))
722         {
723             vlog_error( "Failure %d at clEnqueueWriteBuffer4\n", error );
724             return TEST_FAIL;
725         }
726 
727         // Fill the buffers with NaN
728         float *f5 = (float*) buf5;
729         float nan_val = nanf("");
730         for( i = 0; i < BUFFER_SIZE / sizeof( float ); i++ )
731             f5[i] = nan_val;
732 
733         // calculate reference results
734         for( i = 0; i < BUFFER_SIZE / sizeof( float ); i++ )
735         {
736             for ( j=0; j<8; j++)
737             {
738                 feclearexcept(FE_OVERFLOW);
739                 switch (j)
740                 {
741 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
742                         // VS2005 might use x87 for straight add/sub, and we can't
743                         // turn that off
744                     case 0:
745                         correct[0][i] = sse_add(buf3[i],buf4[i]); break;
746                     case 1:
747                         correct[1][i] = sse_sub(buf3[i],buf3[i]); break;
748                     case 2:
749                         correct[2][i] = sse_add(buf4[i],buf3[i]); break;
750                     case 3:
751                         correct[3][i] = sse_sub(buf3[i],buf3[i]); break;
752                     case 4:
753                         correct[4][i] = -sse_add(buf3[i],buf4[i]); break;
754                     case 5:
755                         correct[5][i] = -sse_sub(buf3[i],buf3[i]); break;
756                     case 6:
757                         correct[6][i] = -sse_add(buf4[i],buf3[i]); break;
758                     case 7:
759                         correct[7][i] = -sse_sub(buf3[i],buf3[i]); break;
760 #else
761                     case 0:
762                         correct[0][i] = buf3[i] + buf4[i]; break;
763                     case 1:
764                         correct[1][i] = buf3[i] - buf3[i]; break;
765                     case 2:
766                         correct[2][i] = buf4[i] + buf3[i]; break;
767                     case 3:
768                         correct[3][i] = buf3[i] - buf3[i]; break;
769                     case 4:
770                         correct[4][i] = -(buf3[i] + buf4[i]); break;
771                     case 5:
772                         correct[5][i] = -(buf3[i] - buf3[i]); break;
773                     case 6:
774                         correct[6][i] = -(buf4[i] + buf3[i]); break;
775                     case 7:
776                         correct[7][i] = -(buf3[i] - buf3[i]); break;
777 #endif
778                 }
779                 // Further skip test inputs if the device doesn support infinities AND NaNs
780                 // resulting sum overflows
781                 skipTest[j][i] = (bufSkip[i] ||
782                                   (gSkipNanInf && (FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)))));
783 
784 #if defined(__PPC__)
785                 // Since the current Power processors don't emulate flush to zero in HW,
786                 // it must be emulated in SW instead.
787                 if (gForceFTZ)
788                 {
789                     if ((fabsf(correct[j][i]) < FLT_MIN) && (correct[j][i] != 0.0f))
790                         correct[j][i] = copysignf(0.0f, correct[j][i]);
791                 }
792 #endif
793             }
794         }
795         if( gHasDouble )
796         {
797             // Spec requires correct non-flushed results
798             // for doubles. We disable FTZ if this is default on
799             // the platform (like ARM) for reference result computation
800             // It is no-op if platform default is not FTZ (e.g. x86)
801             FPU_mode_type oldMode;
802             DisableFTZ( &oldMode );
803 
804             buf3_double = (double *)malloc( BUFFER_SIZE );
805             buf4_double = (double *)malloc( BUFFER_SIZE );
806             buf5_double = (double *)malloc( BUFFER_SIZE );
807             buf6_double = (double *)malloc( BUFFER_SIZE );
808             if( NULL == buf3_double || NULL == buf4_double || NULL == buf5_double || NULL == buf6_double )
809             {
810                 vlog_error( "Out of memory initializing DP buffers\n" );
811                 return TEST_FAIL;
812             }
813             for( i = 0; i < sizeof( correct_double ) / sizeof( correct_double[0] ); i++ )
814             {
815                 correct_double[i] = (double *)malloc( BUFFER_SIZE );
816                 if( NULL == correct_double[i] )
817                 {
818                     vlog_error( "Out of memory initializing DP buffers 2\n" );
819                     return TEST_FAIL;
820                 }
821             }
822 
823 
824             double *f  = (double*) buf1;
825             double *f2 = (double*) buf2;
826             double *f3 = (double*) buf3_double;
827             double *f4 = (double*) buf4_double;
828             for( i = 0; i < BUFFER_SIZE / sizeof(double); i++ )
829             {
830                 double q = f[i];
831                 double q2 = f2[i];
832 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
833                 // VS2005 might use x87 for straight multiplies, and we can't
834                 // turn that off
835                 f3[i] = sse_mul_sd(q, q2);
836                 f4[i] = sse_mul_sd(-q, q2);
837 #else
838                 f3[i] = q * q2;
839                 f4[i] = -q * q2;
840 #endif
841             }
842 
843             if( (error = clEnqueueWriteBuffer(gQueue, bufC_double, CL_FALSE, 0, BUFFER_SIZE, buf3_double, 0, NULL, NULL) ))
844             {
845                 vlog_error( "Failure %d at clEnqueueWriteBuffer3\n", error );
846                 return TEST_FAIL;
847             }
848             if( (error = clEnqueueWriteBuffer(gQueue, bufD_double, CL_FALSE, 0, BUFFER_SIZE, buf4_double, 0, NULL, NULL) ))
849             {
850                 vlog_error( "Failure %d at clEnqueueWriteBuffer4\n", error );
851                 return TEST_FAIL;
852             }
853 
854             // Fill the buffers with NaN
855             double *f5 = (double*) buf5_double;
856             double nan_val = nanf("");
857             for( i = 0; i < BUFFER_SIZE / sizeof( double ); i++ )
858                 f5[i] = nan_val;
859 
860             // calculate reference results
861             for( i = 0; i < BUFFER_SIZE / sizeof( double ); i++ )
862             {
863 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
864                 // VS2005 might use x87 for straight add/sub, and we can't
865                 // turn that off
866                 correct_double[0][i] = sse_add_sd(buf3_double[i],buf4_double[i]);
867                 correct_double[1][i] = sse_sub_sd(buf3_double[i],buf3_double[i]);
868                 correct_double[2][i] = sse_add_sd(buf4_double[i],buf3_double[i]);
869                 correct_double[3][i] = sse_sub_sd(buf3_double[i],buf3_double[i]);
870                 correct_double[4][i] = -sse_add_sd(buf3_double[i],buf4_double[i]);
871                 correct_double[5][i] = -sse_sub_sd(buf3_double[i],buf3_double[i]);
872                 correct_double[6][i] = -sse_add_sd(buf4_double[i],buf3_double[i]);
873                 correct_double[7][i] = -sse_sub_sd(buf3_double[i],buf3_double[i]);
874 #else
875                 correct_double[0][i] = buf3_double[i] + buf4_double[i];
876                 correct_double[1][i] = buf3_double[i] - buf3_double[i];
877                 correct_double[2][i] = buf4_double[i] + buf3_double[i];
878                 correct_double[3][i] = buf3_double[i] - buf3_double[i];
879                 correct_double[4][i] = -(buf3_double[i] + buf4_double[i]);
880                 correct_double[5][i] = -(buf3_double[i] - buf3_double[i]);
881                 correct_double[6][i] = -(buf4_double[i] + buf3_double[i]);
882                 correct_double[7][i] = -(buf3_double[i] - buf3_double[i]);
883 #endif
884             }
885 
886             // Restore previous FP state since we modified it for
887             // reference result computation (see DisableFTZ call above)
888             RestoreFPState(&oldMode);
889         }
890     }
891 
892     char c[1000];
893     static const char *no_yes[] = { "NO", "YES" };
894     vlog( "\nCompute Device info:\n" );
895     clGetDeviceInfo( device,  CL_DEVICE_NAME, sizeof(c), (void *)&c, NULL);
896     vlog( "\tDevice Name: %s\n", c );
897     clGetDeviceInfo( device,  CL_DEVICE_VENDOR, sizeof(c), (void *)&c, NULL);
898     vlog( "\tVendor: %s\n", c );
899     clGetDeviceInfo( device,  CL_DEVICE_VERSION, sizeof(c), (void *)&c, NULL);
900     vlog( "\tDevice Version: %s\n", c );
901     clGetDeviceInfo( device, CL_DEVICE_OPENCL_C_VERSION, sizeof(c), &c, NULL);
902     vlog( "\tCL C Version: %s\n", c );
903     clGetDeviceInfo( device,  CL_DRIVER_VERSION, sizeof(c), (void *)&c, NULL);
904     vlog( "\tDriver Version: %s\n", c );
905     vlog( "\tSubnormal values supported? %s\n", no_yes[0 != (CL_FP_DENORM & floatCapabilities)] );
906     vlog( "\tTesting with FTZ mode ON? %s\n", no_yes[0 != gForceFTZ] );
907     vlog( "\tTesting Doubles? %s\n", no_yes[0 != gHasDouble] );
908     vlog( "\tRandom Number seed: 0x%8.8x\n", gSeed );
909     vlog( "\n\n" );
910 
911     return TEST_PASS;
912 }
913 
ReleaseCL(void)914 static void ReleaseCL( void )
915 {
916     clReleaseMemObject(bufA);
917     clReleaseMemObject(bufB);
918     clReleaseMemObject(bufC);
919     clReleaseMemObject(bufD);
920     clReleaseMemObject(bufE);
921     clReleaseProgram(gProgram[0]);
922     clReleaseProgram(gProgram[1]);
923     clReleaseProgram(gProgram[2]);
924     clReleaseProgram(gProgram[3]);
925     clReleaseProgram(gProgram[4]);
926     if( gHasDouble )
927     {
928         clReleaseMemObject(bufC_double);
929         clReleaseMemObject(bufD_double);
930         clReleaseProgram(gProgram_double[0]);
931         clReleaseProgram(gProgram_double[1]);
932         clReleaseProgram(gProgram_double[2]);
933         clReleaseProgram(gProgram_double[3]);
934         clReleaseProgram(gProgram_double[4]);
935     }
936     clReleaseCommandQueue(gQueue);
937     clReleaseContext(gContext);
938 }
939 
940 
RunTest(int testNumber)941 static int RunTest( int testNumber )
942 {
943     size_t i;
944     int error = 0;
945     cl_mem args[4];
946     float *c;
947     const char *kernelName[] = { "kernel1", "kernel2", "kernel3", "kernel4",
948         "kernel5", "kernel6", "kernel7", "kernel8" };
949     switch( testNumber )
950     {
951         case 0:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD;     c = buf4;   break;      // a * b + c
952         case 1:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC;     c = buf3;   break;
953         case 2:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD;     c = buf4;   break;
954         case 3:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC;     c = buf3;   break;
955         case 4:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD;     c = buf4;   break;
956         case 5:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC;     c = buf3;   break;
957         case 6:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD;     c = buf4;   break;
958         case 7:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC;     c = buf3;   break;
959         default:
960             vlog_error( "Unknown test case %d passed to RunTest\n", testNumber );
961             return -1;
962     }
963 
964 
965     int vectorSize;
966     for( vectorSize = 0; vectorSize < 5; vectorSize++ )
967     {
968         cl_kernel k = clCreateKernel( gProgram[ vectorSize ], kernelName[ testNumber ], &error );
969         if( NULL == k || error )
970         {
971             vlog_error( "%d) Unable to find kernel \"%s\" for vector size: %d\n", error, kernelName[ testNumber ], 1 << vectorSize );
972             return -2;
973         }
974 
975         // set the kernel args
976         for( i = 0; i < sizeof(args ) / sizeof( args[0]); i++ )
977             if( (error = clSetKernelArg(k, i, sizeof( cl_mem ), args + i) ))
978             {
979                 vlog_error( "Error %d setting kernel arg # %ld\n", error, i );
980                 return error;
981             }
982 
983         // write NaNs to the result array
984         if( (error = clEnqueueWriteBuffer(gQueue, bufE, CL_TRUE, 0, BUFFER_SIZE, buf5, 0, NULL, NULL) ))
985         {
986             vlog_error( "Failure %d at clWriteArray %d\n", error, testNumber );
987             return error;
988         }
989 
990         // execute the kernel
991         size_t gDim[3] = { BUFFER_SIZE / (sizeof( cl_float ) * (1<<vectorSize)), 0, 0 };
992         if( ((error = clEnqueueNDRangeKernel(gQueue, k, 1, NULL, gDim, NULL, 0, NULL, NULL) )))
993         {
994             vlog_error( "Got Error # %d trying to execture kernel\n", error );
995             return error;
996         }
997 
998         // read the data back
999         if( (error = clEnqueueReadBuffer(gQueue, bufE, CL_TRUE, 0, BUFFER_SIZE, buf6, 0, NULL, NULL ) ))
1000         {
1001             vlog_error( "Failure %d at clReadArray %d\n", error, testNumber );
1002             return error;
1003         }
1004 
1005         // verify results
1006         float *test = (float*) buf6;
1007         float *a = (float*) buf1;
1008         float *b = (float*) buf2;
1009         for( i = 0; i < BUFFER_SIZE / sizeof( float ); i++ )
1010         {
1011             if( isnan(test[i]) && isnan(correct[testNumber][i] ) )
1012                 continue;
1013 
1014             if( skipTest[testNumber][i] )
1015                 continue;
1016 
1017             // sign of zero must be correct
1018             if(( ((uint32_t*) test)[i] != ((uint32_t*) correct[testNumber])[i] ) &&
1019                !(gIgnoreZeroSign && (test[i] == 0.0f) && (correct[testNumber][i] == 0.0f)) )
1020             {
1021                 switch( testNumber )
1022                 {
1023                         // Zeros for these should be positive
1024                     case 0:     vlog_error( "%ld) Error for %s %s: %a * %a + %a =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
1025                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
1026                     case 1:     vlog_error( "%ld) Error for %s %s: %a * %a - %a =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
1027                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
1028                     case 2:     vlog_error( "%ld) Error for %s %s: %a + %a * %a =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
1029                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
1030                     case 3:     vlog_error( "%ld) Error for %s %s: %a - %a * %a =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
1031                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
1032 
1033                         // Zeros for these should be negative
1034                     case 4:     vlog_error( "%ld) Error for %s %s: -(%a * %a + %a) =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
1035                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
1036                     case 5:     vlog_error( "%ld) Error for %s %s: -(%a * %a - %a) =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
1037                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
1038                     case 6:     vlog_error( "%ld) Error for %s %s: -(%a + %a * %a) =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
1039                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
1040                     case 7:     vlog_error( "%ld) Error for %s %s: -(%a - %a * %a) =  *%a vs. %a\n", i, sizeNames[ vectorSize], kernelName[ testNumber ],
1041                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       clReleaseKernel(k); return -1;
1042                     default:
1043                         vlog_error( "error: Unknown test number!\n" );
1044                         clReleaseKernel(k);
1045                         return -2;
1046                 }
1047             }
1048         }
1049 
1050         clReleaseKernel(k);
1051     }
1052 
1053     return error;
1054 }
1055 
RunTest_Double(int testNumber)1056 static int RunTest_Double( int testNumber )
1057 {
1058     if( !gHasDouble )
1059     {
1060         vlog("Double is not supported, test not run.\n");
1061         return 0;
1062     }
1063 
1064     size_t i;
1065     int error = 0;
1066     cl_mem args[4];
1067     double *c;
1068     const char *kernelName[] = { "kernel1", "kernel2", "kernel3", "kernel4",
1069         "kernel5", "kernel6", "kernel7", "kernel8" };
1070 
1071     switch( testNumber )
1072     {
1073         case 0:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD_double;     c = buf4_double;   break;      // a * b + c
1074         case 1:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC_double;     c = buf3_double;   break;
1075         case 2:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD_double;     c = buf4_double;   break;
1076         case 3:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC_double;     c = buf3_double;   break;
1077         case 4:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD_double;     c = buf4_double;   break;
1078         case 5:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC_double;     c = buf3_double;   break;
1079         case 6:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufD_double;     c = buf4_double;   break;
1080         case 7:     args[0] = bufE;     args[1] = bufA;     args[2] = bufB;     args[3] = bufC_double;     c = buf3_double;   break;
1081         default:
1082             vlog_error( "Unknown test case %d passed to RunTest\n", testNumber );
1083             return -1;
1084     }
1085 
1086     int vectorSize;
1087     for( vectorSize = 0; vectorSize < 5; vectorSize++ )
1088     {
1089         cl_kernel k = clCreateKernel( gProgram_double[ vectorSize ], kernelName[ testNumber ], &error );
1090         if( NULL == k || error )
1091         {
1092             vlog_error( "%d) Unable to find kernel \"%s\" for vector size: %d\n", error, kernelName[ testNumber ], 1 << vectorSize );
1093             return -2;
1094         }
1095 
1096         // set the kernel args
1097         for( i = 0; i < sizeof(args ) / sizeof( args[0]); i++ )
1098             if( (error = clSetKernelArg(k, i, sizeof( cl_mem ), args + i) ))
1099             {
1100                 vlog_error( "Error %d setting kernel arg # %ld\n", error, i );
1101                 return error;
1102             }
1103 
1104         // write NaNs to the result array
1105         if( (error = clEnqueueWriteBuffer(gQueue, bufE, CL_FALSE, 0, BUFFER_SIZE, buf5_double, 0, NULL, NULL) ))
1106         {
1107             vlog_error( "Failure %d at clWriteArray %d\n", error, testNumber );
1108             return error;
1109         }
1110 
1111         // execute the kernel
1112         size_t gDim[3] = { BUFFER_SIZE / (sizeof( cl_double ) * (1<<vectorSize)), 0, 0 };
1113         if( ((error = clEnqueueNDRangeKernel(gQueue, k, 1, NULL, gDim, NULL, 0, NULL, NULL) )))
1114         {
1115             vlog_error( "Got Error # %d trying to execture kernel\n", error );
1116             return error;
1117         }
1118 
1119         // read the data back
1120         if( (error = clEnqueueReadBuffer(gQueue, bufE, CL_TRUE, 0, BUFFER_SIZE, buf6_double, 0, NULL, NULL ) ))
1121         {
1122             vlog_error( "Failure %d at clReadArray %d\n", error, testNumber );
1123             return error;
1124         }
1125 
1126         // verify results
1127         double *test = (double*) buf6_double;
1128         double *a = (double*) buf1;
1129         double *b = (double*) buf2;
1130         for( i = 0; i < BUFFER_SIZE / sizeof( double ); i++ )
1131         {
1132             if( isnan(test[i]) && isnan(correct_double[testNumber][i] ) )
1133                 continue;
1134 
1135             // sign of zero must be correct
1136             if( ((uint64_t*) test)[i] != ((uint64_t*) correct_double[testNumber])[i] )
1137             {
1138                 switch( testNumber )
1139                 {
1140                         // Zeros for these should be positive
1141                     case 0:     vlog_error( "%ld) Error for %s %s: %a * %a + %a =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
1142                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       return -1;
1143                     case 1:     vlog_error( "%ld) Error for %s %s: %a * %a - %a =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
1144                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       return -1;
1145                     case 2:     vlog_error( "%ld) Error for %s %s: %a + %a * %a =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
1146                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       return -1;
1147                     case 3:     vlog_error( "%ld) Error for %s %s: %a - %a * %a =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
1148                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       return -1;
1149 
1150                         // Zeros for these should be negative
1151                     case 4:     vlog_error( "%ld) Error for %s %s: -(%a * %a + %a) =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
1152                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       return -1;
1153                     case 5:     vlog_error( "%ld) Error for %s %s: -(%a * %a - %a) =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
1154                                            a[i], b[i], c[i], correct[testNumber][i], test[i] );       return -1;
1155                     case 6:     vlog_error( "%ld) Error for %s %s: -(%a + %a * %a) =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
1156                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       return -1;
1157                     case 7:     vlog_error( "%ld) Error for %s %s: -(%a - %a * %a) =  *%a vs. %a\n", i, sizeNames_double[ vectorSize], kernelName[ testNumber ],
1158                                            c[i], a[i], b[i], correct[testNumber][i], test[i] );       return -1;
1159                     default:
1160                         vlog_error( "error: Unknown test number!\n" );
1161                         return -2;
1162                 }
1163             }
1164         }
1165 
1166         clReleaseKernel(k);
1167     }
1168 
1169     return error;
1170 }
1171