1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 #include "harness/testHarness.h"
18
19 #include <string.h>
20
21 #include <algorithm>
22
23 #include "cl_utils.h"
24 #include "tests.h"
25
26 #include <CL/cl_half.h>
27
Test_vLoadHalf_private(cl_device_id device,bool aligned)28 int Test_vLoadHalf_private( cl_device_id device, bool aligned )
29 {
30 cl_int error;
31 int vectorSize;
32 cl_program programs[kVectorSizeCount+kStrangeVectorSizeCount][AS_NumAddressSpaces] = {{0}};
33 cl_kernel kernels[kVectorSizeCount+kStrangeVectorSizeCount][AS_NumAddressSpaces] = {{0}};
34 uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
35 uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
36 size_t q;
37
38 memset( min_time, -1, sizeof( min_time ) );
39
40 const char *vector_size_names[] = {"1", "2", "4", "8", "16", "3"};
41
42 int minVectorSize = kMinVectorSize;
43
44 // There is no aligned scalar vloada_half
45 if (aligned && minVectorSize == 0) minVectorSize = 1;
46
47 for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
48 vectorSize++)
49 {
50
51 int effectiveVectorSize = g_arrVecSizes[vectorSize];
52 if(effectiveVectorSize == 3 && aligned) {
53 effectiveVectorSize = 4;
54 }
55 const char *source[] = {
56 "__kernel void test( const __global half *p, __global float", vector_size_name_extensions[vectorSize], " *f )\n"
57 "{\n"
58 " size_t i = get_global_id(0);\n"
59 " f[i] = vload", aligned ? "a" : "", "_half",vector_size_name_extensions[vectorSize],"( i, p );\n"
60 "}\n"
61 };
62
63 const char *sourceV3[] = {
64 "__kernel void test( const __global half *p, __global float *f,\n"
65 " uint extra_last_thread)\n"
66 "{\n"
67 " size_t i = get_global_id(0);\n"
68 " size_t last_i = get_global_size(0)-1;\n"
69 " if(last_i == i && extra_last_thread != 0) {\n"
70 " if(extra_last_thread ==2) {\n"
71 " f[3*i+1] = vload_half(3*i+1, p);\n"
72 " }\n"
73 " f[3*i] = vload_half(3*i, p);\n"
74 " } else {\n"
75 " vstore3(vload_half3( i, p ),i,f);\n"
76 " }\n"
77 "}\n"
78 };
79
80 const char *sourceV3aligned[] = {
81 "__kernel void test( const __global half *p, __global float3 *f )\n"
82 "{\n"
83 " size_t i = get_global_id(0);\n"
84 " f[i] = vloada_half3( i, p );\n"
85 " ((__global float *)f)[4*i+3] = vload_half(4*i+3,p);\n"
86 "}\n"
87 };
88
89 const char *source_private1[] = {
90 "__kernel void test( const __global half *p, __global float *f )\n"
91 "{\n"
92 " __private ushort data[1];\n"
93 " __private half* hdata_p = (__private half*) data;\n"
94 " size_t i = get_global_id(0);\n"
95 " data[0] = ((__global ushort*)p)[i];\n"
96 " f[i] = vload", (aligned ? "a" : ""), "_half( 0, hdata_p );\n"
97 "}\n"
98 };
99
100 const char *source_private2[] = {
101 "__kernel void test( const __global half *p, __global float", vector_size_name_extensions[vectorSize], " *f )\n"
102 "{\n"
103 " __private ", align_types[vectorSize], " data[", vector_size_names[vectorSize], "/", align_divisors[vectorSize], "];\n"
104 " __private half* hdata_p = (__private half*) data;\n"
105 " __global ", align_types[vectorSize], "* i_p = (__global ", align_types[vectorSize], "*)p;\n"
106 " size_t i = get_global_id(0);\n"
107 " int k;\n"
108 " for (k=0; k<",vector_size_names[vectorSize],"/",align_divisors[vectorSize],"; k++)\n"
109 " data[k] = i_p[i+k];\n"
110 " f[i] = vload", aligned ? "a" : "", "_half",vector_size_name_extensions[vectorSize],"( 0, hdata_p );\n"
111 "}\n"
112 };
113
114 const char *source_privateV3[] = {
115 "__kernel void test( const __global half *p, __global float *f,"
116 " uint extra_last_thread )\n"
117 "{\n"
118 " __private ushort data[3];\n"
119 " __private half* hdata_p = (__private half*) data;\n"
120 " __global ushort* i_p = (__global ushort*)p;\n"
121 " size_t i = get_global_id(0);\n"
122 " int k;\n"
123 // " data = vload3(i, i_p);\n"
124 " size_t last_i = get_global_size(0)-1;\n"
125 " if(last_i == i && extra_last_thread != 0) {\n"
126 " if(extra_last_thread ==2) {\n"
127 " f[3*i+1] = vload_half(3*i+1, p);\n"
128 " }\n"
129 " f[3*i] = vload_half(3*i, p);\n"
130 " } else {\n"
131 " for (k=0; k<3; k++)\n"
132 " data[k] = i_p[i*3+k];\n"
133 " vstore3(vload_half3( 0, hdata_p ), i, f);\n"
134 " }\n"
135 "}\n"
136 };
137
138 const char *source_privateV3aligned[] = {
139 "__kernel void test( const __global half *p, __global float3 *f )\n"
140 "{\n"
141 " ushort4 data[4];\n" // declare as vector for alignment. Make four to check to see vloada_half3 index is working.
142 " half* hdata_p = (half*) &data;\n"
143 " size_t i = get_global_id(0);\n"
144 " global ushort* i_p = (global ushort*)p + i * 4;\n"
145 " int offset = i & 3;\n"
146 " data[offset] = (ushort4)( i_p[0], i_p[1], i_p[2], USHRT_MAX ); \n"
147 " data[offset^1] = USHRT_MAX; \n"
148 " data[offset^2] = USHRT_MAX; \n"
149 " data[offset^3] = USHRT_MAX; \n"
150 // test vloada_half3
151 " f[i] = vloada_half3( offset, hdata_p );\n"
152 // Fill in the 4th value so we don't have to special case this code elsewhere in the test.
153 " mem_fence(CLK_GLOBAL_MEM_FENCE );\n"
154 " ((__global float *)f)[4*i+3] = vload_half(4*i+3, p);\n"
155 "}\n"
156 };
157
158 char local_buf_size[10];
159
160 sprintf(local_buf_size, "%lld", (uint64_t)((effectiveVectorSize))*gWorkGroupSize);
161 const char *source_local1[] = {
162 "__kernel void test( const __global half *p, __global float *f )\n"
163 "{\n"
164 " __local ushort data[",local_buf_size,"];\n"
165 " __local half* hdata_p = (__local half*) data;\n"
166 " size_t i = get_global_id(0);\n"
167 " size_t lid = get_local_id(0);\n"
168 " data[lid] = ((__global ushort*)p)[i];\n"
169 " f[i] = vload", aligned ? "a" : "", "_half( lid, hdata_p );\n"
170 "}\n"
171 };
172
173 const char *source_local2[] = {
174 "#define VECTOR_LEN (",
175 vector_size_names[vectorSize],
176 "/",
177 align_divisors[vectorSize],
178 ")\n"
179 "#define ALIGN_TYPE ",
180 align_types[vectorSize],
181 "\n"
182 "__kernel void test( const __global half *p, __global float",
183 vector_size_name_extensions[vectorSize],
184 " *f )\n"
185 "{\n"
186 " __local uchar data[",
187 local_buf_size,
188 "/",
189 align_divisors[vectorSize],
190 "*sizeof(ALIGN_TYPE)] ",
191 "__attribute__((aligned(sizeof(ALIGN_TYPE))));\n"
192 " __local half* hdata_p = (__local half*) data;\n"
193 " __global ALIGN_TYPE* i_p = (__global ALIGN_TYPE*)p;\n"
194 " size_t i = get_global_id(0);\n"
195 " size_t lid = get_local_id(0);\n"
196 " int k;\n"
197 " for (k=0; k<VECTOR_LEN; k++)\n"
198 " *(__local ",
199 "ALIGN_TYPE*)&(data[(lid*VECTOR_LEN+k)*sizeof(ALIGN_TYPE)]) = ",
200 "i_p[i*VECTOR_LEN+k];\n"
201 " f[i] = vload",
202 aligned ? "a" : "",
203 "_half",
204 vector_size_name_extensions[vectorSize],
205 "( lid, hdata_p );\n"
206 "}\n"
207 };
208
209 const char *source_localV3[] = {
210 "__kernel void test( const __global half *p, __global float *f,\n"
211 " uint extra_last_thread)\n"
212 "{\n"
213 " __local ushort data[", local_buf_size,"];\n"
214 " __local half* hdata_p = (__local half*) data;\n"
215 " __global ushort* i_p = (__global ushort*)p;\n"
216 " size_t i = get_global_id(0);\n"
217 " size_t last_i = get_global_size(0)-1;\n"
218 " size_t lid = get_local_id(0);\n"
219 " int k;\n"
220 " if(last_i == i && extra_last_thread != 0) {\n"
221 " if(extra_last_thread ==2) {\n"
222 " f[3*i+1] = vload_half(3*i+1, p);\n"
223 " }\n"
224 " f[3*i] = vload_half(3*i, p);\n"
225 " } else {\n"
226 " for (k=0; k<3; k++)\n"
227 " data[lid*3+k] = i_p[i*3+k];\n"
228 " vstore3( vload_half3( lid, hdata_p ),i,f);\n"
229 " };\n"
230 "}\n"
231 };
232
233 const char *source_localV3aligned[] = {
234 "__kernel void test( const __global half *p, __global float3 *f )\n"
235 "{\n"
236 " __local ushort data[", local_buf_size,"];\n"
237 " __local half* hdata_p = (__local half*) data;\n"
238 " __global ushort* i_p = (__global ushort*)p;\n"
239 " size_t i = get_global_id(0);\n"
240 " size_t lid = get_local_id(0);\n"
241 " int k;\n"
242 " for (k=0; k<4; k++)\n"
243 " data[lid*4+k] = i_p[i*4+k];\n"
244 " f[i] = vloada_half3( lid, hdata_p );\n"
245 " ((__global float *)f)[4*i+3] = vload_half(lid*4+3, hdata_p);\n"
246 "}\n"
247 };
248
249 const char *source_constant[] = {
250 "__kernel void test( __constant half *p, __global float", vector_size_name_extensions[vectorSize], " *f )\n"
251 "{\n"
252 " size_t i = get_global_id(0);\n"
253 " f[i] = vload", aligned ? "a" : "", "_half",vector_size_name_extensions[vectorSize],"( i, p );\n"
254 "}\n"
255 };
256
257 const char *source_constantV3[] = {
258 "__kernel void test( __constant half *p, __global float *f,\n"
259 " uint extra_last_thread)\n"
260 "{\n"
261 " size_t i = get_global_id(0);\n"
262 " size_t last_i = get_global_size(0)-1;\n"
263 " if(last_i == i && extra_last_thread != 0) {\n"
264 " if(extra_last_thread ==2) {\n"
265 " f[3*i+1] = vload_half(3*i+1, p);\n"
266 " }\n"
267 " f[3*i] = vload_half(3*i, p);\n"
268 " } else {\n"
269 " vstore3(vload_half",vector_size_name_extensions[vectorSize],"( i, p ), i, f);\n"
270 " }\n"
271 "}\n"
272 };
273
274 const char *source_constantV3aligned[] = {
275 "__kernel void test( __constant half *p, __global float3 *f )\n"
276 "{\n"
277 " size_t i = get_global_id(0);\n"
278 " f[i] = vloada_half3( i, p );\n"
279 " ((__global float *)f)[4*i+3] = vload_half(4*i+3,p);\n"
280 "}\n"
281 };
282
283
284 if(g_arrVecSizes[vectorSize] != 3) {
285 programs[vectorSize][AS_Global] = MakeProgram( device, source, sizeof( source) / sizeof( source[0]) );
286 if( NULL == programs[ vectorSize ][AS_Global] ) {
287 gFailCount++;
288 vlog_error( "\t\tFAILED -- Failed to create program.\n" );
289 for ( q= 0; q < sizeof( source) / sizeof( source[0]); q++)
290 vlog_error("%s", source[q]);
291 return -1;
292 } else {
293 }
294 } else if(aligned) {
295 programs[vectorSize][AS_Global] = MakeProgram( device, sourceV3aligned, sizeof( sourceV3aligned) / sizeof( sourceV3aligned[0]) );
296 if( NULL == programs[ vectorSize ][AS_Global] ) {
297 gFailCount++;
298 vlog_error( "\t\tFAILED -- Failed to create program.\n" );
299 for ( q= 0; q < sizeof( sourceV3aligned) / sizeof( sourceV3aligned[0]); q++)
300 vlog_error("%s", sourceV3aligned[q]);
301 return -1;
302 } else {
303 }
304 } else {
305 programs[vectorSize][AS_Global] = MakeProgram( device, sourceV3, sizeof( sourceV3) / sizeof( sourceV3[0]) );
306 if( NULL == programs[ vectorSize ][AS_Global] ) {
307 gFailCount++;
308 vlog_error( "\t\tFAILED -- Failed to create program.\n" );
309 for ( q= 0; q < sizeof( sourceV3) / sizeof( sourceV3[0]); q++)
310 vlog_error("%s", sourceV3[q]);
311 return -1;
312 }
313 }
314
315 kernels[ vectorSize ][AS_Global] = clCreateKernel( programs[ vectorSize ][AS_Global], "test", &error );
316 if( NULL == kernels[vectorSize][AS_Global] )
317 {
318 gFailCount++;
319 vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
320 return -2;
321 }
322
323 const char** source_ptr;
324 uint32_t source_size;
325 if (vectorSize == 0) {
326 source_ptr = source_private1;
327 source_size = sizeof( source_private1) / sizeof( source_private1[0]);
328 } else if(g_arrVecSizes[vectorSize] == 3) {
329 if(aligned) {
330 source_ptr = source_privateV3aligned;
331 source_size = sizeof( source_privateV3aligned) / sizeof( source_privateV3aligned[0]);
332 } else {
333 source_ptr = source_privateV3;
334 source_size = sizeof( source_privateV3) / sizeof( source_privateV3[0]);
335 }
336 } else {
337 source_ptr = source_private2;
338 source_size = sizeof( source_private2) / sizeof( source_private2[0]);
339 }
340 programs[vectorSize][AS_Private] = MakeProgram( device, source_ptr, source_size );
341 if( NULL == programs[ vectorSize ][AS_Private] )
342 {
343 gFailCount++;
344 vlog_error( "\t\tFAILED -- Failed to create private program.\n" );
345 for ( q= 0; q < source_size; q++)
346 vlog_error("%s", source_ptr[q]);
347 return -1;
348 }
349
350 kernels[ vectorSize ][AS_Private] = clCreateKernel( programs[ vectorSize ][AS_Private], "test", &error );
351 if( NULL == kernels[vectorSize][AS_Private] )
352 {
353 gFailCount++;
354 vlog_error( "\t\tFAILED -- Failed to create private kernel. (%d)\n", error );
355 return -2;
356 }
357
358 if (vectorSize == 0) {
359 source_ptr = source_local1;
360 source_size = sizeof( source_local1) / sizeof( source_local1[0]);
361 } else if(g_arrVecSizes[vectorSize] == 3) {
362 if(aligned) {
363 source_ptr = source_localV3aligned;
364 source_size = sizeof(source_localV3aligned)/sizeof(source_localV3aligned[0]);
365 } else {
366 source_ptr = source_localV3;
367 source_size = sizeof(source_localV3)/sizeof(source_localV3[0]);
368 }
369 } else {
370 source_ptr = source_local2;
371 source_size = sizeof( source_local2) / sizeof( source_local2[0]);
372 }
373 programs[vectorSize][AS_Local] = MakeProgram( device, source_ptr, source_size );
374 if( NULL == programs[ vectorSize ][AS_Local] )
375 {
376 gFailCount++;
377 vlog_error( "\t\tFAILED -- Failed to create local program.\n" );
378 for ( q= 0; q < source_size; q++)
379 vlog_error("%s", source_ptr[q]);
380 return -1;
381 }
382
383 kernels[ vectorSize ][AS_Local] = clCreateKernel( programs[ vectorSize ][AS_Local], "test", &error );
384 if( NULL == kernels[vectorSize][AS_Local] )
385 {
386 gFailCount++;
387 vlog_error( "\t\tFAILED -- Failed to create local kernel. (%d)\n", error );
388 return -2;
389 }
390
391 if(g_arrVecSizes[vectorSize] == 3) {
392 if(aligned) {
393 programs[vectorSize][AS_Constant] = MakeProgram( device, source_constantV3aligned, sizeof(source_constantV3aligned) / sizeof( source_constantV3aligned[0]) );
394 if( NULL == programs[ vectorSize ][AS_Constant] )
395 {
396 gFailCount++;
397 vlog_error( "\t\tFAILED -- Failed to create constant program.\n" );
398 for ( q= 0; q < sizeof( source_constantV3aligned) / sizeof( source_constantV3aligned[0]); q++)
399 vlog_error("%s", source_constantV3aligned[q]);
400 return -1;
401 }
402 } else {
403 programs[vectorSize][AS_Constant] = MakeProgram( device, source_constantV3, sizeof(source_constantV3) / sizeof( source_constantV3[0]) );
404 if( NULL == programs[ vectorSize ][AS_Constant] )
405 {
406 gFailCount++;
407 vlog_error( "\t\tFAILED -- Failed to create constant program.\n" );
408 for ( q= 0; q < sizeof( source_constantV3) / sizeof( source_constantV3[0]); q++)
409 vlog_error("%s", source_constantV3[q]);
410 return -1;
411 }
412 }
413 } else {
414 programs[vectorSize][AS_Constant] = MakeProgram( device, source_constant, sizeof(source_constant) / sizeof( source_constant[0]) );
415 if( NULL == programs[ vectorSize ][AS_Constant] )
416 {
417 gFailCount++;
418 vlog_error( "\t\tFAILED -- Failed to create constant program.\n" );
419 for ( q= 0; q < sizeof( source_constant) / sizeof( source_constant[0]); q++)
420 vlog_error("%s", source_constant[q]);
421 return -1;
422 }
423 }
424
425 kernels[ vectorSize ][AS_Constant] = clCreateKernel( programs[ vectorSize ][AS_Constant], "test", &error );
426 if( NULL == kernels[vectorSize][AS_Constant] )
427 {
428 gFailCount++;
429 vlog_error( "\t\tFAILED -- Failed to create constant kernel. (%d)\n", error );
430 return -2;
431 }
432 }
433
434 // Figure out how many elements are in a work block
435 size_t elementSize = std::max(sizeof(cl_half), sizeof(cl_float));
436 size_t blockCount = getBufferSize(device) / elementSize; // elementSize is power of 2
437 uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of things of size cl_half
438
439 // we handle 64-bit types a bit differently.
440 if( lastCase == 0 )
441 lastCase = 0x100000000ULL;
442
443
444 uint64_t i, j;
445 uint64_t printMask = (lastCase >> 4) - 1;
446 uint32_t count = 0;
447 error = 0;
448 int addressSpace;
449 // int reported_vector_skip = 0;
450
451 for( i = 0; i < (uint64_t)lastCase; i += blockCount )
452 {
453 count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
454
455 //Init the input stream
456 uint16_t *p = (uint16_t *)gIn_half;
457 for( j = 0; j < count; j++ )
458 p[j] = j + i;
459
460 if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_half, CL_TRUE, 0, count * sizeof( cl_half ), gIn_half, 0, NULL, NULL)))
461 {
462 vlog_error( "Failure in clWriteArray\n" );
463 gFailCount++;
464 goto exit;
465 }
466
467 //create the reference result
468 const unsigned short *s = (const unsigned short *)gIn_half;
469 float *d = (float *)gOut_single_reference;
470 for (j = 0; j < count; j++) d[j] = cl_half_to_float(s[j]);
471
472 //Check the vector lengths
473 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
474 { // here we loop through vector sizes, 3 is last
475
476 for ( addressSpace = 0; addressSpace < AS_NumAddressSpaces; addressSpace++) {
477 uint32_t pattern = 0x7fffdead;
478
479 /*
480 if (addressSpace == 3) {
481 vlog("Note: skipping address space %s due to small buffer size.\n", addressSpaceNames[addressSpace]);
482 continue;
483 }
484 */
485 memset_pattern4( gOut_single, &pattern, getBufferSize(device));
486 if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_single, CL_TRUE, 0, count * sizeof( float ), gOut_single, 0, NULL, NULL)) )
487 {
488 vlog_error( "Failure in clWriteArray\n" );
489 gFailCount++;
490 goto exit;
491 }
492
493 if(g_arrVecSizes[vectorSize] == 3 && !aligned) {
494 // now we need to add the extra const argument for how
495 // many elements the last thread should take care of.
496 }
497
498 // okay, here is where we have to be careful
499 if( (error = RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_half, gOutBuffer_single, numVecs(count, vectorSize, aligned) ,
500 runsOverBy(count, vectorSize, aligned) ) ) )
501 {
502 gFailCount++;
503 goto exit;
504 }
505
506 if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_single, CL_TRUE, 0, count * sizeof( float ), gOut_single, 0, NULL, NULL)) )
507 {
508 vlog_error( "Failure in clReadArray\n" );
509 gFailCount++;
510 goto exit;
511 }
512
513 if( memcmp( gOut_single, gOut_single_reference, count * sizeof( float )) )
514 {
515 uint32_t *u1 = (uint32_t *)gOut_single;
516 uint32_t *u2 = (uint32_t *)gOut_single_reference;
517 float *f1 = (float *)gOut_single;
518 float *f2 = (float *)gOut_single_reference;
519 for( j = 0; j < count; j++ )
520 {
521 if(isnan(f1[j]) && isnan(f2[j])) // both are nan dont compare them
522 continue;
523 if( u1[j] != u2[j])
524 {
525 vlog_error( " %lld) (of %lld) Failure at 0x%4.4x: %a vs *%a (0x%8.8x vs *0x%8.8x) vector_size = %d (%s) address space = %s, load is %s\n",
526 j, (uint64_t)count, ((unsigned short*)gIn_half)[j], f1[j], f2[j], u1[j], u2[j], (g_arrVecSizes[vectorSize]),
527 vector_size_names[vectorSize], addressSpaceNames[addressSpace],
528 (aligned?"aligned":"unaligned"));
529 gFailCount++;
530 error = -1;
531 goto exit;
532 }
533 }
534 }
535
536 if( gReportTimes && addressSpace == 0)
537 {
538 //Run again for timing
539 for( j = 0; j < 100; j++ )
540 {
541 uint64_t startTime = ReadTime();
542 error =
543 RunKernel(device, kernels[vectorSize][addressSpace], gInBuffer_half, gOutBuffer_single, numVecs(count, vectorSize, aligned) ,
544 runsOverBy(count, vectorSize, aligned));
545 if(error)
546 {
547 gFailCount++;
548 goto exit;
549 }
550
551 if( (error = clFinish(gQueue)) )
552 {
553 vlog_error( "Failure in clFinish\n" );
554 gFailCount++;
555 goto exit;
556 }
557 uint64_t currentTime = ReadTime() - startTime;
558 time[ vectorSize ] += currentTime;
559 if( currentTime < min_time[ vectorSize ] )
560 min_time[ vectorSize ] = currentTime ;
561 }
562 }
563 }
564 }
565
566 if( ((i+blockCount) & ~printMask) == (i+blockCount) )
567 {
568 vlog( "." );
569 fflush( stdout );
570 }
571 }
572
573 vlog( "\n" );
574
575 if( gReportTimes )
576 {
577 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
578 vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * 100), 0,
579 "average us/elem", "vLoad%sHalf avg. (%s, vector size: %d)", ( (aligned) ? "a" : ""), addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
580 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
581 vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0,
582 "best us/elem", "vLoad%sHalf best (%s vector size: %d)", ( (aligned) ? "a" : ""), addressSpaceNames[0], (g_arrVecSizes[vectorSize]) );
583 }
584
585 exit:
586 //clean up
587 for( vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
588 {
589 for ( addressSpace = 0; addressSpace < AS_NumAddressSpaces; addressSpace++) {
590 clReleaseKernel( kernels[ vectorSize ][addressSpace] );
591 clReleaseProgram( programs[ vectorSize ][addressSpace] );
592 }
593 }
594
595 return error;
596 }
597
test_vload_half(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)598 int test_vload_half( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
599 {
600 return Test_vLoadHalf_private( device, false );
601 }
602
test_vloada_half(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)603 int test_vloada_half( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
604 {
605 return Test_vLoadHalf_private( device, true );
606 }
607
608