xref: /aosp_15_r20/external/OpenCL-CTS/test_conformance/half/Test_vLoadHalf.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 #include "harness/testHarness.h"
18 
19 #include <string.h>
20 
21 #include <algorithm>
22 
23 #include "cl_utils.h"
24 #include "tests.h"
25 
26 #include <CL/cl_half.h>
27 
Test_vLoadHalf_private(cl_device_id device,bool aligned)28 int Test_vLoadHalf_private( cl_device_id device, bool aligned )
29 {
30     cl_int error;
31     int vectorSize;
32     cl_program  programs[kVectorSizeCount+kStrangeVectorSizeCount][AS_NumAddressSpaces] = {{0}};
33     cl_kernel   kernels[kVectorSizeCount+kStrangeVectorSizeCount][AS_NumAddressSpaces] = {{0}};
34     uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
35     uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
36     size_t q;
37 
38     memset( min_time, -1, sizeof( min_time ) );
39 
40     const char *vector_size_names[]   = {"1", "2", "4", "8", "16", "3"};
41 
42     int minVectorSize = kMinVectorSize;
43 
44     // There is no aligned scalar vloada_half
45     if (aligned && minVectorSize == 0) minVectorSize = 1;
46 
47     for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
48          vectorSize++)
49     {
50 
51         int effectiveVectorSize = g_arrVecSizes[vectorSize];
52         if(effectiveVectorSize == 3 && aligned) {
53             effectiveVectorSize = 4;
54         }
55         const char *source[] = {
56             "__kernel void test( const __global half *p, __global float", vector_size_name_extensions[vectorSize], " *f )\n"
57             "{\n"
58             "   size_t i = get_global_id(0);\n"
59             "   f[i] = vload", aligned ? "a" : "", "_half",vector_size_name_extensions[vectorSize],"( i, p );\n"
60             "}\n"
61         };
62 
63         const char *sourceV3[] = {
64             "__kernel void test( const __global half *p, __global float *f,\n"
65             "                   uint extra_last_thread)\n"
66             "{\n"
67             "   size_t i = get_global_id(0);\n"
68             "   size_t last_i = get_global_size(0)-1;\n"
69             "   if(last_i == i && extra_last_thread != 0) {\n"
70             "     if(extra_last_thread ==2) {\n"
71             "       f[3*i+1] = vload_half(3*i+1, p);\n"
72             "     }\n"
73             "     f[3*i] = vload_half(3*i, p);\n"
74             "   } else {\n"
75             "     vstore3(vload_half3( i, p ),i,f);\n"
76             "   }\n"
77             "}\n"
78         };
79 
80         const char *sourceV3aligned[] = {
81             "__kernel void test( const __global half *p, __global float3 *f )\n"
82             "{\n"
83             "   size_t i = get_global_id(0);\n"
84             "   f[i] = vloada_half3( i, p );\n"
85             "   ((__global float *)f)[4*i+3] = vload_half(4*i+3,p);\n"
86             "}\n"
87         };
88 
89         const char *source_private1[] = {
90             "__kernel void test( const __global half *p, __global float *f )\n"
91             "{\n"
92             "   __private ushort data[1];\n"
93             "   __private half* hdata_p = (__private half*) data;\n"
94             "   size_t i = get_global_id(0);\n"
95             "   data[0] = ((__global ushort*)p)[i];\n"
96             "   f[i] = vload", (aligned ? "a" : ""), "_half( 0, hdata_p );\n"
97             "}\n"
98         };
99 
100         const char *source_private2[] = {
101             "__kernel void test( const __global half *p, __global float", vector_size_name_extensions[vectorSize], " *f )\n"
102             "{\n"
103             "   __private ", align_types[vectorSize], " data[", vector_size_names[vectorSize], "/", align_divisors[vectorSize], "];\n"
104             "   __private half* hdata_p = (__private half*) data;\n"
105             "   __global  ", align_types[vectorSize], "* i_p = (__global ", align_types[vectorSize], "*)p;\n"
106             "   size_t i = get_global_id(0);\n"
107             "   int k;\n"
108             "   for (k=0; k<",vector_size_names[vectorSize],"/",align_divisors[vectorSize],"; k++)\n"
109             "     data[k] = i_p[i+k];\n"
110             "   f[i] = vload", aligned ? "a" : "", "_half",vector_size_name_extensions[vectorSize],"( 0, hdata_p );\n"
111             "}\n"
112         };
113 
114         const char *source_privateV3[] = {
115             "__kernel void test( const __global half *p, __global float *f,"
116             "                    uint extra_last_thread )\n"
117             "{\n"
118             "   __private ushort data[3];\n"
119             "   __private half* hdata_p = (__private half*) data;\n"
120             "   __global  ushort* i_p = (__global  ushort*)p;\n"
121             "   size_t i = get_global_id(0);\n"
122             "   int k;\n"
123             //        "   data = vload3(i, i_p);\n"
124             "   size_t last_i = get_global_size(0)-1;\n"
125             "   if(last_i == i && extra_last_thread != 0) {\n"
126             "     if(extra_last_thread ==2) {\n"
127             "       f[3*i+1] = vload_half(3*i+1, p);\n"
128             "     }\n"
129             "     f[3*i] = vload_half(3*i, p);\n"
130             "   } else {\n"
131             "     for (k=0; k<3; k++)\n"
132             "       data[k] = i_p[i*3+k];\n"
133             "     vstore3(vload_half3( 0, hdata_p ), i, f);\n"
134             "   }\n"
135             "}\n"
136         };
137 
138         const char *source_privateV3aligned[] = {
139             "__kernel void test( const __global half *p, __global float3 *f )\n"
140             "{\n"
141             "   ushort4 data[4];\n"  // declare as vector for alignment. Make four to check to see vloada_half3 index is working.
142             "   half* hdata_p = (half*) &data;\n"
143             "   size_t i = get_global_id(0);\n"
144             "   global  ushort* i_p = (global  ushort*)p + i * 4;\n"
145             "   int offset = i & 3;\n"
146             "   data[offset] = (ushort4)( i_p[0], i_p[1], i_p[2], USHRT_MAX ); \n"
147             "   data[offset^1] = USHRT_MAX; \n"
148             "   data[offset^2] = USHRT_MAX; \n"
149             "   data[offset^3] = USHRT_MAX; \n"
150             //  test vloada_half3
151             "   f[i] = vloada_half3( offset, hdata_p );\n"
152             //  Fill in the 4th value so we don't have to special case this code elsewhere in the test.
153             "   mem_fence(CLK_GLOBAL_MEM_FENCE );\n"
154             "   ((__global float *)f)[4*i+3] = vload_half(4*i+3, p);\n"
155             "}\n"
156         };
157 
158         char local_buf_size[10];
159 
160         sprintf(local_buf_size, "%lld", (uint64_t)((effectiveVectorSize))*gWorkGroupSize);
161         const char *source_local1[] = {
162             "__kernel void test( const __global half *p, __global float *f )\n"
163             "{\n"
164             "   __local ushort data[",local_buf_size,"];\n"
165             "   __local half* hdata_p = (__local half*) data;\n"
166             "   size_t i = get_global_id(0);\n"
167             "   size_t lid = get_local_id(0);\n"
168             "   data[lid] = ((__global ushort*)p)[i];\n"
169             "   f[i] = vload", aligned ? "a" : "", "_half( lid, hdata_p );\n"
170             "}\n"
171         };
172 
173         const char *source_local2[] = {
174             "#define VECTOR_LEN (",
175             vector_size_names[vectorSize],
176             "/",
177             align_divisors[vectorSize],
178             ")\n"
179             "#define ALIGN_TYPE ",
180             align_types[vectorSize],
181             "\n"
182             "__kernel void test( const __global half *p, __global float",
183             vector_size_name_extensions[vectorSize],
184             " *f )\n"
185             "{\n"
186             "   __local uchar data[",
187             local_buf_size,
188             "/",
189             align_divisors[vectorSize],
190             "*sizeof(ALIGN_TYPE)] ",
191             "__attribute__((aligned(sizeof(ALIGN_TYPE))));\n"
192             "   __local half* hdata_p = (__local half*) data;\n"
193             "   __global ALIGN_TYPE* i_p = (__global ALIGN_TYPE*)p;\n"
194             "   size_t i = get_global_id(0);\n"
195             "   size_t lid = get_local_id(0);\n"
196             "   int k;\n"
197             "   for (k=0; k<VECTOR_LEN; k++)\n"
198             "     *(__local ",
199             "ALIGN_TYPE*)&(data[(lid*VECTOR_LEN+k)*sizeof(ALIGN_TYPE)]) = ",
200             "i_p[i*VECTOR_LEN+k];\n"
201             "   f[i] = vload",
202             aligned ? "a" : "",
203             "_half",
204             vector_size_name_extensions[vectorSize],
205             "( lid, hdata_p );\n"
206             "}\n"
207         };
208 
209         const char *source_localV3[] = {
210             "__kernel void test( const __global half *p, __global float *f,\n"
211             "                    uint extra_last_thread)\n"
212             "{\n"
213             "   __local ushort data[", local_buf_size,"];\n"
214             "   __local half* hdata_p = (__local half*) data;\n"
215             "   __global  ushort* i_p = (__global  ushort*)p;\n"
216             "   size_t i = get_global_id(0);\n"
217             "   size_t last_i = get_global_size(0)-1;\n"
218             "   size_t lid = get_local_id(0);\n"
219             "   int k;\n"
220             "   if(last_i == i && extra_last_thread != 0) {\n"
221             "     if(extra_last_thread ==2) {\n"
222             "       f[3*i+1] = vload_half(3*i+1, p);\n"
223             "     }\n"
224             "     f[3*i] = vload_half(3*i, p);\n"
225             "   } else {\n"
226             "     for (k=0; k<3; k++)\n"
227             "       data[lid*3+k] = i_p[i*3+k];\n"
228             "     vstore3( vload_half3( lid, hdata_p ),i,f);\n"
229             "   };\n"
230             "}\n"
231         };
232 
233         const char *source_localV3aligned[] = {
234             "__kernel void test( const __global half *p, __global float3 *f )\n"
235             "{\n"
236             "   __local ushort data[", local_buf_size,"];\n"
237             "   __local half* hdata_p = (__local half*) data;\n"
238             "   __global  ushort* i_p = (__global  ushort*)p;\n"
239             "   size_t i = get_global_id(0);\n"
240             "   size_t lid = get_local_id(0);\n"
241             "   int k;\n"
242             "   for (k=0; k<4; k++)\n"
243             "     data[lid*4+k] = i_p[i*4+k];\n"
244             "   f[i] = vloada_half3( lid, hdata_p );\n"
245             "   ((__global float *)f)[4*i+3] = vload_half(lid*4+3, hdata_p);\n"
246             "}\n"
247         };
248 
249         const char *source_constant[] = {
250             "__kernel void test( __constant half *p, __global float", vector_size_name_extensions[vectorSize], " *f )\n"
251             "{\n"
252             "   size_t i = get_global_id(0);\n"
253             "   f[i] = vload", aligned ? "a" : "", "_half",vector_size_name_extensions[vectorSize],"( i, p );\n"
254             "}\n"
255         };
256 
257         const char *source_constantV3[] = {
258             "__kernel void test( __constant half *p, __global float *f,\n"
259             "                    uint extra_last_thread)\n"
260             "{\n"
261             "   size_t i = get_global_id(0);\n"
262             "   size_t last_i = get_global_size(0)-1;\n"
263             "   if(last_i == i && extra_last_thread != 0) {\n"
264             "     if(extra_last_thread ==2) {\n"
265             "       f[3*i+1] = vload_half(3*i+1, p);\n"
266             "     }\n"
267             "     f[3*i] = vload_half(3*i, p);\n"
268             "   } else {\n"
269             "     vstore3(vload_half",vector_size_name_extensions[vectorSize],"( i, p ), i, f);\n"
270             "   }\n"
271             "}\n"
272         };
273 
274         const char *source_constantV3aligned[] = {
275             "__kernel void test( __constant half *p, __global float3 *f )\n"
276             "{\n"
277             "   size_t i = get_global_id(0);\n"
278             "   f[i] = vloada_half3( i, p );\n"
279             "   ((__global float *)f)[4*i+3] = vload_half(4*i+3,p);\n"
280             "}\n"
281         };
282 
283 
284         if(g_arrVecSizes[vectorSize] != 3) {
285             programs[vectorSize][AS_Global] = MakeProgram( device, source, sizeof( source) / sizeof( source[0])  );
286             if( NULL == programs[ vectorSize ][AS_Global] ) {
287                 gFailCount++;
288                 vlog_error( "\t\tFAILED -- Failed to create program.\n" );
289                 for ( q= 0; q < sizeof( source) / sizeof( source[0]); q++)
290                     vlog_error("%s", source[q]);
291                 return -1;
292             } else {
293             }
294         } else if(aligned) {
295             programs[vectorSize][AS_Global] = MakeProgram( device, sourceV3aligned, sizeof( sourceV3aligned) / sizeof( sourceV3aligned[0])  );
296             if( NULL == programs[ vectorSize ][AS_Global] ) {
297                 gFailCount++;
298                 vlog_error( "\t\tFAILED -- Failed to create program.\n" );
299                 for ( q= 0; q < sizeof( sourceV3aligned) / sizeof( sourceV3aligned[0]); q++)
300                     vlog_error("%s", sourceV3aligned[q]);
301                 return -1;
302             } else {
303             }
304         } else {
305             programs[vectorSize][AS_Global] = MakeProgram( device, sourceV3, sizeof( sourceV3) / sizeof( sourceV3[0])  );
306             if( NULL == programs[ vectorSize ][AS_Global] ) {
307                 gFailCount++;
308                 vlog_error( "\t\tFAILED -- Failed to create program.\n" );
309                 for ( q= 0; q < sizeof( sourceV3) / sizeof( sourceV3[0]); q++)
310                     vlog_error("%s", sourceV3[q]);
311                 return -1;
312             }
313         }
314 
315         kernels[ vectorSize ][AS_Global] = clCreateKernel( programs[ vectorSize ][AS_Global], "test", &error );
316         if( NULL == kernels[vectorSize][AS_Global] )
317         {
318             gFailCount++;
319             vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
320             return -2;
321         }
322 
323         const char** source_ptr;
324         uint32_t source_size;
325         if (vectorSize == 0) {
326             source_ptr = source_private1;
327             source_size = sizeof( source_private1) / sizeof( source_private1[0]);
328         } else if(g_arrVecSizes[vectorSize] == 3) {
329             if(aligned) {
330                 source_ptr = source_privateV3aligned;
331                 source_size = sizeof( source_privateV3aligned) / sizeof( source_privateV3aligned[0]);
332             } else {
333                 source_ptr = source_privateV3;
334                 source_size = sizeof( source_privateV3) / sizeof( source_privateV3[0]);
335             }
336         } else {
337             source_ptr = source_private2;
338             source_size = sizeof( source_private2) / sizeof( source_private2[0]);
339         }
340         programs[vectorSize][AS_Private] = MakeProgram( device, source_ptr, source_size );
341         if( NULL == programs[ vectorSize ][AS_Private] )
342         {
343             gFailCount++;
344             vlog_error( "\t\tFAILED -- Failed to create private program.\n" );
345             for ( q= 0; q < source_size; q++)
346                 vlog_error("%s", source_ptr[q]);
347             return -1;
348         }
349 
350         kernels[ vectorSize ][AS_Private] = clCreateKernel( programs[ vectorSize ][AS_Private], "test", &error );
351         if( NULL == kernels[vectorSize][AS_Private] )
352         {
353             gFailCount++;
354             vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
355             return -2;
356         }
357 
358         if (vectorSize == 0) {
359             source_ptr = source_local1;
360             source_size = sizeof( source_local1) / sizeof( source_local1[0]);
361         } else if(g_arrVecSizes[vectorSize] == 3) {
362             if(aligned) {
363                 source_ptr = source_localV3aligned;
364                 source_size = sizeof(source_localV3aligned)/sizeof(source_localV3aligned[0]);
365             } else  {
366                 source_ptr = source_localV3;
367                 source_size = sizeof(source_localV3)/sizeof(source_localV3[0]);
368             }
369         } else {
370             source_ptr = source_local2;
371             source_size = sizeof( source_local2) / sizeof( source_local2[0]);
372         }
373         programs[vectorSize][AS_Local] = MakeProgram( device, source_ptr, source_size );
374         if( NULL == programs[ vectorSize ][AS_Local] )
375         {
376             gFailCount++;
377             vlog_error( "\t\tFAILED -- Failed to create local program.\n" );
378             for ( q= 0; q < source_size; q++)
379                 vlog_error("%s", source_ptr[q]);
380             return -1;
381         }
382 
383         kernels[ vectorSize ][AS_Local] = clCreateKernel( programs[ vectorSize ][AS_Local], "test", &error );
384         if( NULL == kernels[vectorSize][AS_Local] )
385         {
386             gFailCount++;
387             vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
388             return -2;
389         }
390 
391         if(g_arrVecSizes[vectorSize] == 3) {
392             if(aligned) {
393                 programs[vectorSize][AS_Constant] = MakeProgram( device, source_constantV3aligned, sizeof(source_constantV3aligned) / sizeof( source_constantV3aligned[0])  );
394                 if( NULL == programs[ vectorSize ][AS_Constant] )
395                 {
396                     gFailCount++;
397                     vlog_error( "\t\tFAILED -- Failed to create constant program.\n" );
398                     for ( q= 0; q < sizeof( source_constantV3aligned) / sizeof( source_constantV3aligned[0]); q++)
399                         vlog_error("%s", source_constantV3aligned[q]);
400                     return -1;
401                 }
402             } else {
403                 programs[vectorSize][AS_Constant] = MakeProgram( device, source_constantV3, sizeof(source_constantV3) / sizeof( source_constantV3[0])  );
404                 if( NULL == programs[ vectorSize ][AS_Constant] )
405                 {
406                     gFailCount++;
407                     vlog_error( "\t\tFAILED -- Failed to create constant program.\n" );
408                     for ( q= 0; q < sizeof( source_constantV3) / sizeof( source_constantV3[0]); q++)
409                         vlog_error("%s", source_constantV3[q]);
410                     return -1;
411                 }
412             }
413         } else {
414             programs[vectorSize][AS_Constant] = MakeProgram( device, source_constant, sizeof(source_constant) / sizeof( source_constant[0])  );
415             if( NULL == programs[ vectorSize ][AS_Constant] )
416             {
417                 gFailCount++;
418                 vlog_error( "\t\tFAILED -- Failed to create constant program.\n" );
419                 for ( q= 0; q < sizeof( source_constant) / sizeof( source_constant[0]); q++)
420                     vlog_error("%s", source_constant[q]);
421                 return -1;
422             }
423         }
424 
425         kernels[ vectorSize ][AS_Constant] = clCreateKernel( programs[ vectorSize ][AS_Constant], "test", &error );
426         if( NULL == kernels[vectorSize][AS_Constant] )
427         {
428             gFailCount++;
429             vlog_error( "\t\tFAILED -- Failed to create constant kernel. (%d)\n", error );
430             return -2;
431         }
432     }
433 
434     // Figure out how many elements are in a work block
435     size_t elementSize = std::max(sizeof(cl_half), sizeof(cl_float));
436     size_t blockCount = getBufferSize(device) / elementSize; // elementSize is power of 2
437     uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of things of size cl_half
438 
439     // we handle 64-bit types a bit differently.
440     if( lastCase == 0 )
441         lastCase = 0x100000000ULL;
442 
443 
444     uint64_t i, j;
445     uint64_t printMask = (lastCase >> 4) - 1;
446     uint32_t count = 0;
447     error = 0;
448     int addressSpace;
449     //    int reported_vector_skip = 0;
450 
451     for( i = 0; i < (uint64_t)lastCase; i += blockCount )
452     {
453         count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
454 
455         //Init the input stream
456         uint16_t *p = (uint16_t *)gIn_half;
457         for( j = 0; j < count; j++ )
458             p[j] = j + i;
459 
460         if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_half, CL_TRUE, 0, count * sizeof( cl_half ), gIn_half, 0, NULL, NULL)))
461         {
462             vlog_error( "Failure in clWriteArray\n" );
463             gFailCount++;
464             goto exit;
465         }
466 
467         //create the reference result
468         const unsigned short *s = (const unsigned short *)gIn_half;
469         float *d = (float *)gOut_single_reference;
470         for (j = 0; j < count; j++) d[j] = cl_half_to_float(s[j]);
471 
472         //Check the vector lengths
473         for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
474         { // here we loop through vector sizes, 3 is last
475 
476             for ( addressSpace = 0; addressSpace < AS_NumAddressSpaces; addressSpace++) {
477                 uint32_t pattern = 0x7fffdead;
478 
479                 /*
480                  if (addressSpace == 3) {
481                  vlog("Note: skipping address space %s due to small buffer size.\n", addressSpaceNames[addressSpace]);
482                  continue;
483                  }
484                  */
485                 memset_pattern4( gOut_single, &pattern, getBufferSize(device));
486                 if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_single, CL_TRUE, 0, count * sizeof( float ), gOut_single, 0, NULL, NULL)) )
487                 {
488                     vlog_error( "Failure in clWriteArray\n" );
489                     gFailCount++;
490                     goto exit;
491                 }
492 
493                 if(g_arrVecSizes[vectorSize] == 3 && !aligned) {
494                     // now we need to add the extra const argument for how
495                     // many elements the last thread should take care of.
496                 }
497 
498                 // okay, here is where we have to be careful
499                 if( (error = RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_half, gOutBuffer_single, numVecs(count, vectorSize, aligned) ,
500                                        runsOverBy(count, vectorSize, aligned) ) ) )
501                 {
502                     gFailCount++;
503                     goto exit;
504                 }
505 
506                 if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_single, CL_TRUE, 0, count * sizeof( float ), gOut_single, 0, NULL, NULL)) )
507                 {
508                     vlog_error( "Failure in clReadArray\n" );
509                     gFailCount++;
510                     goto exit;
511                 }
512 
513                 if( memcmp( gOut_single, gOut_single_reference, count * sizeof( float )) )
514                 {
515                     uint32_t *u1 = (uint32_t *)gOut_single;
516                     uint32_t *u2 = (uint32_t *)gOut_single_reference;
517                     float *f1 = (float *)gOut_single;
518                     float *f2 = (float *)gOut_single_reference;
519                     for( j = 0; j < count; j++ )
520                     {
521                         if(isnan(f1[j]) && isnan(f2[j])) // both are nan dont compare them
522                             continue;
523                         if( u1[j] != u2[j])
524                         {
525                             vlog_error( " %lld)  (of %lld) Failure at 0x%4.4x:  %a vs *%a  (0x%8.8x vs *0x%8.8x)  vector_size = %d (%s) address space = %s, load is %s\n",
526                                        j, (uint64_t)count, ((unsigned short*)gIn_half)[j], f1[j], f2[j], u1[j], u2[j], (g_arrVecSizes[vectorSize]),
527                                        vector_size_names[vectorSize], addressSpaceNames[addressSpace],
528                                        (aligned?"aligned":"unaligned"));
529                             gFailCount++;
530                             error = -1;
531                             goto exit;
532                         }
533                     }
534                 }
535 
536                 if( gReportTimes && addressSpace == 0)
537                 {
538                     //Run again for timing
539                     for( j = 0; j < 100; j++ )
540                     {
541                         uint64_t startTime = ReadTime();
542                         error =
543                         RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_half, gOutBuffer_single, numVecs(count, vectorSize, aligned) ,
544                                   runsOverBy(count, vectorSize, aligned));
545                         if(error)
546                         {
547                             gFailCount++;
548                             goto exit;
549                         }
550 
551                         if( (error = clFinish(gQueue)) )
552                         {
553                             vlog_error( "Failure in clFinish\n" );
554                             gFailCount++;
555                             goto exit;
556                         }
557                         uint64_t currentTime = ReadTime() - startTime;
558                         time[ vectorSize ] += currentTime;
559                         if( currentTime < min_time[ vectorSize ] )
560                             min_time[ vectorSize ] = currentTime ;
561                     }
562                 }
563             }
564         }
565 
566         if( ((i+blockCount) & ~printMask) == (i+blockCount) )
567         {
568             vlog( "." );
569             fflush( stdout );
570         }
571     }
572 
573     vlog( "\n" );
574 
575     if( gReportTimes )
576     {
577         for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
578             vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * 100), 0,
579                       "average us/elem", "vLoad%sHalf avg. (%s, vector size: %d)", ( (aligned) ? "a" : ""), addressSpaceNames[0], (g_arrVecSizes[vectorSize])  );
580         for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
581             vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
582                       "best us/elem", "vLoad%sHalf best (%s vector size: %d)", ( (aligned) ? "a" : ""), addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
583     }
584 
585 exit:
586     //clean up
587     for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
588     {
589         for ( addressSpace = 0; addressSpace < AS_NumAddressSpaces; addressSpace++) {
590             clReleaseKernel( kernels[ vectorSize ][addressSpace] );
591             clReleaseProgram( programs[ vectorSize ][addressSpace] );
592         }
593     }
594 
595     return error;
596 }
597 
test_vload_half(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)598 int test_vload_half( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
599 {
600     return Test_vLoadHalf_private( device, false );
601 }
602 
test_vloada_half(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)603 int test_vloada_half( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
604 {
605     return Test_vLoadHalf_private( device, true );
606 }
607 
608