1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include <string.h>
17
18 #include <algorithm>
19
20 #include "cl_utils.h"
21 #include "tests.h"
22 #include "harness/testHarness.h"
23
test_roundTrip(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)24 int test_roundTrip( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
25 {
26 int vectorSize, error;
27 uint64_t i, j;
28 cl_program programs[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
29 cl_kernel kernels[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
30 cl_program doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
31 cl_kernel doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
32 uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
33 uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
34 uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
35 uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
36 memset( min_time, -1, sizeof( min_time ) );
37 memset( min_double_time, -1, sizeof( min_double_time ) );
38
39 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
40 {
41 const char *source[] = {
42 "__kernel void test( const __global half *in, __global half *out )\n"
43 "{\n"
44 " size_t i = get_global_id(0);\n"
45 " vstore_half",vector_size_name_extensions[vectorSize],"( vload_half",vector_size_name_extensions[vectorSize],"(i, in), i, out);\n"
46 "}\n"
47 };
48
49 const char *doubleSource[] = {
50 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
51 "__kernel void test( const __global half *in, __global half *out )\n"
52 "{\n"
53 " size_t i = get_global_id(0);\n"
54 " vstore_half",vector_size_name_extensions[vectorSize],"( convert_double", vector_size_name_extensions[vectorSize], "( vload_half",vector_size_name_extensions[vectorSize],"(i, in)), i, out);\n"
55 "}\n"
56 };
57
58 const char *sourceV3[] = {
59 "__kernel void test( const __global half *in, __global half *out,"
60 " uint extra_last_thread )\n"
61 "{\n"
62 " size_t i = get_global_id(0);\n"
63 " size_t last_i = get_global_size(0)-1;\n"
64 " size_t adjust = 0;\n"
65 " if(i == last_i && extra_last_thread != 0) { \n"
66 " adjust = 3-extra_last_thread;\n"
67 " }\n"
68 " vstore_half3( vload_half3(i, in-adjust), i, out-adjust);\n"
69 "}\n"
70 };
71
72 const char *doubleSourceV3[] = {
73 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
74 "__kernel void test( const __global half *in, __global half *out,"
75 " uint extra_last_thread )\n"
76 "{\n"
77 " size_t i = get_global_id(0);\n"
78 " size_t last_i = get_global_size(0)-1;\n"
79 " size_t adjust = 0;\n"
80 " if(i == last_i && extra_last_thread != 0) { \n"
81 " adjust = 3-extra_last_thread;\n"
82 " }\n"
83 " vstore_half3( vload_half3(i, in-adjust), i, out-adjust);\n"
84 "}\n"
85 };
86
87 /*
88 const char *sourceV3aligned[] = {
89 "__kernel void test( const __global half *in, __global half *out )\n"
90 "{\n"
91 " size_t i = get_global_id(0);\n"
92 " vstorea_half3( vloada_half3(i, in), i, out);\n"
93 " vstore_half(vload_half(4*i+3, in), 4*i+3, out);\n"
94 "}\n"
95 };
96
97 const char *doubleSourceV3aligned[] = {
98 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
99 "__kernel void test( const __global half *in, __global half *out )\n"
100 "{\n"
101 " size_t i = get_global_id(0);\n"
102 " vstorea_half3( vloada_half3(i, in), i, out);\n"
103 " vstore_half(vload_half(4*i+3, in), 4*i+3, out);\n"
104 "}\n"
105 };
106 */
107
108 if(g_arrVecSizes[vectorSize] == 3) {
109 programs[vectorSize] = MakeProgram( device, sourceV3, sizeof( sourceV3) / sizeof( sourceV3[0]) );
110 if( NULL == programs[ vectorSize ] )
111 {
112 gFailCount++;
113
114 return -1;
115 }
116 } else {
117 programs[vectorSize] = MakeProgram( device, source, sizeof( source) / sizeof( source[0]) );
118 if( NULL == programs[ vectorSize ] )
119 {
120 gFailCount++;
121 return -1;
122 }
123 }
124
125 kernels[ vectorSize ] = clCreateKernel( programs[ vectorSize ], "test", &error );
126 if( NULL == kernels[vectorSize] )
127 {
128 gFailCount++;
129 vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
130 return error;
131 }
132
133 if( gTestDouble )
134 {
135 if(g_arrVecSizes[vectorSize] == 3) {
136 doublePrograms[vectorSize] = MakeProgram( device, doubleSourceV3, sizeof( doubleSourceV3) / sizeof( doubleSourceV3[0]) );
137 if( NULL == doublePrograms[ vectorSize ] )
138 {
139 gFailCount++;
140 return -1;
141 }
142 } else {
143 doublePrograms[vectorSize] = MakeProgram( device, doubleSource, sizeof( doubleSource) / sizeof( doubleSource[0]) );
144 if( NULL == doublePrograms[ vectorSize ] )
145 {
146 gFailCount++;
147 return -1;
148 }
149 }
150
151 doubleKernels[ vectorSize ] = clCreateKernel( doublePrograms[ vectorSize ], "test", &error );
152 if( NULL == doubleKernels[vectorSize] )
153 {
154 gFailCount++;
155 vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
156 return error;
157 }
158 }
159 }
160
161 // Figure out how many elements are in a work block
162 size_t elementSize = std::max(sizeof(cl_half), sizeof(cl_float));
163 size_t blockCount = (size_t)getBufferSize(device) / elementSize; //elementSize is a power of two
164 uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of cl_half
165 size_t stride = blockCount;
166
167 error = 0;
168 uint64_t printMask = (lastCase >> 4) - 1;
169 uint32_t count;
170 size_t loopCount;
171
172 for( i = 0; i < (uint64_t)lastCase; i += stride )
173 {
174 count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
175
176 //Init the input stream
177 uint16_t *p = (uint16_t *)gIn_half;
178 for( j = 0; j < count; j++ )
179 p[j] = j + i;
180
181 if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_half, CL_TRUE, 0, count * sizeof( cl_half ), gIn_half, 0, NULL, NULL)) )
182 {
183 vlog_error( "Failure in clWriteArray\n" );
184 gFailCount++;
185 goto exit;
186 }
187
188 //Check the vector lengths
189 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
190 { // here we loop through vector sizes -- 3 is last.
191 uint32_t pattern = 0xdeaddead;
192 memset_pattern4( gOut_half, &pattern, (size_t)getBufferSize(device)/2);
193
194 if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
195 {
196 vlog_error( "Failure in clWriteArray\n" );
197 gFailCount++;
198 goto exit;
199 }
200
201
202 // here is where "3" starts to cause problems.
203 error = RunKernel(device, kernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
204 runsOverBy(count, vectorSize, false) );
205 if(error)
206 {
207 gFailCount++;
208 goto exit;
209 }
210
211 if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
212 {
213 vlog_error( "Failure in clReadArray\n" );
214 gFailCount++;
215 goto exit;
216 }
217
218 if( (memcmp( gOut_half, gIn_half, count * sizeof(cl_half))) )
219 {
220 uint16_t *u1 = (uint16_t *)gOut_half;
221 uint16_t *u2 = (uint16_t *)gIn_half;
222 for( j = 0; j < count; j++ )
223 {
224 if( u1[j] != u2[j] )
225 {
226 uint16_t abs1 = u1[j] & 0x7fff;
227 uint16_t abs2 = u2[j] & 0x7fff;
228 if( abs1 > 0x7c00 && abs2 > 0x7c00 )
229 continue; //any NaN is okay if NaN is input
230
231 // if reference result is sub normal, test if the output is flushed to zero
232 if( IsHalfSubnormal(u2[j]) && ( (u1[j] == 0) || (u1[j] == 0x8000) ) )
233 continue;
234
235 vlog_error( "%lld) (of %lld) Failure at 0x%4.4x: 0x%4.4x vector_size = %d \n", j, (uint64_t)count, u2[j], u1[j], (g_arrVecSizes[vectorSize]) );
236 gFailCount++;
237 error = -1;
238 goto exit;
239 }
240 }
241 }
242
243 if( gTestDouble )
244 {
245 memset_pattern4( gOut_half, &pattern, (size_t)getBufferSize(device)/2);
246 if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
247 {
248 vlog_error( "Failure in clWriteArray\n" );
249 gFailCount++;
250 goto exit;
251 }
252
253
254 if( (error = RunKernel(device, doubleKernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
255 runsOverBy(count, vectorSize, false) ) ) )
256 {
257 gFailCount++;
258 goto exit;
259 }
260
261 if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
262 {
263 vlog_error( "Failure in clReadArray\n" );
264 gFailCount++;
265 goto exit;
266 }
267
268 if( (memcmp( gOut_half, gIn_half, count * sizeof(cl_half))) )
269 {
270 uint16_t *u1 = (uint16_t *)gOut_half;
271 uint16_t *u2 = (uint16_t *)gIn_half;
272 for( j = 0; j < count; j++ )
273 {
274 if( u1[j] != u2[j] )
275 {
276 uint16_t abs1 = u1[j] & 0x7fff;
277 uint16_t abs2 = u2[j] & 0x7fff;
278 if( abs1 > 0x7c00 && abs2 > 0x7c00 )
279 continue; //any NaN is okay if NaN is input
280
281 // if reference result is sub normal, test if the output is flushed to zero
282 if( IsHalfSubnormal(u2[j]) && ( (u1[j] == 0) || (u1[j] == 0x8000) ) )
283 continue;
284
285 vlog_error( "%lld) Failure at 0x%4.4x: 0x%4.4x vector_size = %d (double precsion)\n", j, u2[j], u1[j], (g_arrVecSizes[vectorSize]) );
286 gFailCount++;
287 error = -1;
288 goto exit;
289 }
290 }
291 }
292 }
293 }
294
295 if( ((i+blockCount) & ~printMask) == (i+blockCount) )
296 {
297 vlog( "." );
298 fflush( stdout );
299 }
300 }
301
302 vlog( "\n" );
303
304 loopCount = 100;
305 if( gReportTimes )
306 {
307 //Run again for timing
308 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
309 {
310 uint64_t bestTime = -1ULL;
311
312 for( j = 0; j < loopCount; j++ )
313 {
314 uint64_t startTime = ReadTime();
315 if( (error = RunKernel(device, kernels[vectorSize], gInBuffer_half, gOutBuffer_half,numVecs(count, vectorSize, false) ,
316 runsOverBy(count, vectorSize, false)) ) )
317 {
318 gFailCount++;
319 goto exit;
320 }
321
322 if( (error = clFinish(gQueue)) )
323 {
324 vlog_error( "Failure in clFinish\n" );
325 gFailCount++;
326 goto exit;
327 }
328 uint64_t currentTime = ReadTime() - startTime;
329 if( currentTime < bestTime )
330 bestTime = currentTime;
331 time[ vectorSize ] += currentTime;
332 }
333 if( bestTime < min_time[ vectorSize ] )
334 min_time[ vectorSize ] = bestTime;
335
336 if( gTestDouble )
337 {
338 bestTime = -1ULL;
339 for( j = 0; j < loopCount; j++ )
340 {
341 uint64_t startTime = ReadTime();
342 if( (error = RunKernel(device, doubleKernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
343 runsOverBy(count, vectorSize, false)) ) )
344 {
345 gFailCount++;
346 goto exit;
347 }
348
349 if( (error = clFinish(gQueue)) )
350 {
351 vlog_error( "Failure in clFinish\n" );
352 gFailCount++;
353 goto exit;
354 }
355 uint64_t currentTime = ReadTime() - startTime;
356 if( currentTime < bestTime )
357 bestTime = currentTime;
358 doubleTime[ vectorSize ] += currentTime;
359 }
360 if( bestTime < min_double_time[ vectorSize ] )
361 min_double_time[ vectorSize ] = bestTime;
362 }
363 }
364 }
365
366 if( gReportTimes )
367 {
368 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
369 vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0, "average us/elem", "roundTrip avg. (vector size: %d)", (g_arrVecSizes[vectorSize]) );
370 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
371 vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0, "best us/elem", "roundTrip best (vector size: %d)", (g_arrVecSizes[vectorSize]) );
372 if( gTestDouble )
373 {
374 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
375 vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0, "average us/elem (double)", "roundTrip avg. d (vector size: %d)", (g_arrVecSizes[vectorSize]) );
376 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
377 vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0, "best us/elem (double)", "roundTrip best d (vector size: %d)", (g_arrVecSizes[vectorSize]) );
378 }
379 }
380
381 exit:
382 //clean up
383 for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
384 {
385 clReleaseKernel( kernels[ vectorSize ] );
386 clReleaseProgram( programs[ vectorSize ] );
387 if( gTestDouble )
388 {
389 clReleaseKernel( doubleKernels[ vectorSize ] );
390 clReleaseProgram( doublePrograms[ vectorSize ] );
391 }
392 }
393
394 return error;
395 }
396
397
398