xref: /aosp_15_r20/external/OpenCL-CTS/test_conformance/half/Test_roundTrip.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include <string.h>
17 
18 #include <algorithm>
19 
20 #include "cl_utils.h"
21 #include "tests.h"
22 #include "harness/testHarness.h"
23 
test_roundTrip(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)24 int test_roundTrip( cl_device_id device, cl_context context, cl_command_queue queue, int num_elements )
25 {
26     int vectorSize, error;
27     uint64_t i, j;
28     cl_program  programs[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
29     cl_kernel   kernels[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
30     cl_program  doublePrograms[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
31     cl_kernel   doubleKernels[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
32     uint64_t time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
33     uint64_t min_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
34     uint64_t doubleTime[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
35     uint64_t min_double_time[kVectorSizeCount+kStrangeVectorSizeCount] = {0};
36     memset( min_time, -1, sizeof( min_time ) );
37     memset( min_double_time, -1, sizeof( min_double_time ) );
38 
39     for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
40     {
41         const char *source[] = {
42             "__kernel void test( const __global half *in, __global half *out )\n"
43             "{\n"
44             "   size_t i = get_global_id(0);\n"
45             "   vstore_half",vector_size_name_extensions[vectorSize],"( vload_half",vector_size_name_extensions[vectorSize],"(i, in),  i, out);\n"
46             "}\n"
47         };
48 
49         const char *doubleSource[] = {
50             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
51             "__kernel void test( const __global half *in, __global half *out )\n"
52             "{\n"
53             "   size_t i = get_global_id(0);\n"
54             "   vstore_half",vector_size_name_extensions[vectorSize],"( convert_double", vector_size_name_extensions[vectorSize], "( vload_half",vector_size_name_extensions[vectorSize],"(i, in)),  i, out);\n"
55             "}\n"
56         };
57 
58         const char *sourceV3[] = {
59             "__kernel void test( const __global half *in, __global half *out,"
60             "                    uint extra_last_thread  )\n"
61             "{\n"
62             "   size_t i = get_global_id(0);\n"
63             "   size_t last_i = get_global_size(0)-1;\n"
64             "   size_t adjust = 0;\n"
65             "   if(i == last_i && extra_last_thread != 0) { \n"
66             "     adjust = 3-extra_last_thread;\n"
67             "   }\n"
68             "   vstore_half3( vload_half3(i, in-adjust),  i, out-adjust);\n"
69             "}\n"
70         };
71 
72         const char *doubleSourceV3[] = {
73             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
74             "__kernel void test( const __global half *in, __global half *out,"
75             "                    uint extra_last_thread  )\n"
76             "{\n"
77             "   size_t i = get_global_id(0);\n"
78             "   size_t last_i = get_global_size(0)-1;\n"
79             "   size_t adjust = 0;\n"
80             "   if(i == last_i && extra_last_thread != 0) { \n"
81             "     adjust = 3-extra_last_thread;\n"
82             "   }\n"
83             "   vstore_half3( vload_half3(i, in-adjust),  i, out-adjust);\n"
84             "}\n"
85         };
86 
87 /*
88         const char *sourceV3aligned[] = {
89             "__kernel void test( const __global half *in, __global half *out )\n"
90             "{\n"
91             "   size_t i = get_global_id(0);\n"
92             "   vstorea_half3( vloada_half3(i, in),  i, out);\n"
93             "   vstore_half(vload_half(4*i+3, in), 4*i+3, out);\n"
94             "}\n"
95         };
96 
97         const char *doubleSourceV3aligned[] = {
98             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
99             "__kernel void test( const __global half *in, __global half *out )\n"
100             "{\n"
101             "   size_t i = get_global_id(0);\n"
102             "   vstorea_half3( vloada_half3(i, in),  i, out);\n"
103             "   vstore_half(vload_half(4*i+3, in), 4*i+3, out);\n"
104             "}\n"
105         };
106 */
107 
108         if(g_arrVecSizes[vectorSize] == 3) {
109             programs[vectorSize] = MakeProgram( device, sourceV3, sizeof( sourceV3) / sizeof( sourceV3[0])  );
110             if( NULL == programs[ vectorSize ] )
111             {
112                 gFailCount++;
113 
114                 return -1;
115             }
116         } else {
117             programs[vectorSize] = MakeProgram( device, source, sizeof( source) / sizeof( source[0])  );
118             if( NULL == programs[ vectorSize ] )
119             {
120                 gFailCount++;
121                 return -1;
122             }
123         }
124 
125         kernels[ vectorSize ] = clCreateKernel( programs[ vectorSize ], "test", &error );
126         if( NULL == kernels[vectorSize] )
127         {
128             gFailCount++;
129             vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
130             return error;
131         }
132 
133         if( gTestDouble )
134         {
135             if(g_arrVecSizes[vectorSize] == 3) {
136                 doublePrograms[vectorSize] = MakeProgram( device, doubleSourceV3, sizeof( doubleSourceV3) / sizeof( doubleSourceV3[0])  );
137                 if( NULL == doublePrograms[ vectorSize ] )
138                 {
139                     gFailCount++;
140                     return -1;
141                 }
142             } else {
143                 doublePrograms[vectorSize] = MakeProgram( device, doubleSource, sizeof( doubleSource) / sizeof( doubleSource[0])  );
144                 if( NULL == doublePrograms[ vectorSize ] )
145                 {
146                     gFailCount++;
147                     return -1;
148                 }
149             }
150 
151             doubleKernels[ vectorSize ] = clCreateKernel( doublePrograms[ vectorSize ], "test", &error );
152             if( NULL == doubleKernels[vectorSize] )
153             {
154                 gFailCount++;
155                 vlog_error( "\t\tFAILED -- Failed to create kernel. (%d)\n", error );
156                 return error;
157             }
158         }
159     }
160 
161     // Figure out how many elements are in a work block
162     size_t elementSize = std::max(sizeof(cl_half), sizeof(cl_float));
163     size_t blockCount = (size_t)getBufferSize(device) / elementSize; //elementSize is a power of two
164     uint64_t lastCase = 1ULL << (8*sizeof(cl_half)); // number of cl_half
165     size_t stride = blockCount;
166 
167     error = 0;
168     uint64_t printMask = (lastCase >> 4) - 1;
169     uint32_t count;
170     size_t loopCount;
171 
172     for( i = 0; i < (uint64_t)lastCase; i += stride )
173     {
174         count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
175 
176         //Init the input stream
177         uint16_t *p = (uint16_t *)gIn_half;
178         for( j = 0; j < count; j++ )
179             p[j] = j + i;
180 
181         if( (error = clEnqueueWriteBuffer(gQueue, gInBuffer_half, CL_TRUE, 0, count * sizeof( cl_half ), gIn_half, 0, NULL, NULL)) )
182         {
183             vlog_error( "Failure in clWriteArray\n" );
184             gFailCount++;
185             goto exit;
186         }
187 
188         //Check the vector lengths
189         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
190         { // here we loop through vector sizes -- 3 is last.
191             uint32_t pattern = 0xdeaddead;
192             memset_pattern4( gOut_half, &pattern, (size_t)getBufferSize(device)/2);
193 
194             if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
195             {
196                 vlog_error( "Failure in clWriteArray\n" );
197                 gFailCount++;
198                 goto exit;
199             }
200 
201 
202             // here is where "3" starts to cause problems.
203             error = RunKernel(device, kernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
204                               runsOverBy(count, vectorSize, false) );
205             if(error)
206             {
207                 gFailCount++;
208                 goto exit;
209             }
210 
211             if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
212             {
213                 vlog_error( "Failure in clReadArray\n" );
214                 gFailCount++;
215                 goto exit;
216             }
217 
218             if( (memcmp( gOut_half, gIn_half, count * sizeof(cl_half))) )
219             {
220                 uint16_t *u1 = (uint16_t *)gOut_half;
221                 uint16_t *u2 = (uint16_t *)gIn_half;
222                 for( j = 0; j < count; j++ )
223                 {
224                     if( u1[j] != u2[j] )
225                     {
226                         uint16_t abs1 = u1[j] & 0x7fff;
227                         uint16_t abs2 = u2[j] & 0x7fff;
228                         if( abs1 > 0x7c00 && abs2 > 0x7c00 )
229                             continue; //any NaN is okay if NaN is input
230 
231                         // if reference result is sub normal, test if the output is flushed to zero
232                         if( IsHalfSubnormal(u2[j]) && ( (u1[j] == 0) || (u1[j] == 0x8000) ) )
233                             continue;
234 
235                         vlog_error( "%lld) (of %lld)  Failure at 0x%4.4x:  0x%4.4x   vector_size = %d \n", j, (uint64_t)count, u2[j], u1[j], (g_arrVecSizes[vectorSize]) );
236                         gFailCount++;
237                         error = -1;
238                         goto exit;
239                     }
240                 }
241             }
242 
243             if( gTestDouble )
244             {
245                 memset_pattern4( gOut_half, &pattern, (size_t)getBufferSize(device)/2);
246                 if( (error = clEnqueueWriteBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
247                 {
248                     vlog_error( "Failure in clWriteArray\n" );
249                     gFailCount++;
250                     goto exit;
251                 }
252 
253 
254                 if( (error = RunKernel(device, doubleKernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
255                                        runsOverBy(count, vectorSize, false) ) ) )
256                 {
257                     gFailCount++;
258                     goto exit;
259                 }
260 
261                 if( (error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0, count * sizeof(cl_half), gOut_half, 0, NULL, NULL)) )
262                 {
263                     vlog_error( "Failure in clReadArray\n" );
264                     gFailCount++;
265                     goto exit;
266                 }
267 
268                 if( (memcmp( gOut_half, gIn_half, count * sizeof(cl_half))) )
269                 {
270                     uint16_t *u1 = (uint16_t *)gOut_half;
271                     uint16_t *u2 = (uint16_t *)gIn_half;
272                     for( j = 0; j < count; j++ )
273                     {
274                         if( u1[j] != u2[j] )
275                         {
276                             uint16_t abs1 = u1[j] & 0x7fff;
277                             uint16_t abs2 = u2[j] & 0x7fff;
278                             if( abs1 > 0x7c00 && abs2 > 0x7c00 )
279                                 continue; //any NaN is okay if NaN is input
280 
281                             // if reference result is sub normal, test if the output is flushed to zero
282                             if( IsHalfSubnormal(u2[j]) && ( (u1[j] == 0) || (u1[j] == 0x8000) ) )
283                                 continue;
284 
285                             vlog_error( "%lld) Failure at 0x%4.4x:  0x%4.4x   vector_size = %d (double precsion)\n", j, u2[j], u1[j], (g_arrVecSizes[vectorSize]) );
286                             gFailCount++;
287                             error = -1;
288                             goto exit;
289                         }
290                     }
291                 }
292             }
293         }
294 
295         if( ((i+blockCount) & ~printMask) == (i+blockCount) )
296         {
297             vlog( "." );
298             fflush( stdout );
299         }
300     }
301 
302     vlog( "\n" );
303 
304     loopCount = 100;
305     if( gReportTimes )
306     {
307         //Run again for timing
308         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
309         {
310             uint64_t bestTime = -1ULL;
311 
312             for( j = 0; j < loopCount; j++ )
313             {
314                 uint64_t startTime = ReadTime();
315                 if( (error = RunKernel(device, kernels[vectorSize], gInBuffer_half, gOutBuffer_half,numVecs(count, vectorSize, false) ,
316                                        runsOverBy(count, vectorSize, false)) ) )
317                 {
318                     gFailCount++;
319                     goto exit;
320                 }
321 
322                 if( (error = clFinish(gQueue)) )
323                 {
324                     vlog_error( "Failure in clFinish\n" );
325                     gFailCount++;
326                     goto exit;
327                 }
328                 uint64_t currentTime = ReadTime() - startTime;
329                 if( currentTime < bestTime )
330                     bestTime = currentTime;
331                 time[ vectorSize ] += currentTime;
332             }
333             if( bestTime < min_time[ vectorSize ] )
334                 min_time[ vectorSize ] = bestTime;
335 
336             if( gTestDouble )
337             {
338                 bestTime = -1ULL;
339                 for( j = 0; j < loopCount; j++ )
340                 {
341                     uint64_t startTime = ReadTime();
342                     if( (error = RunKernel(device, doubleKernels[vectorSize], gInBuffer_half, gOutBuffer_half, numVecs(count, vectorSize, false) ,
343                                            runsOverBy(count, vectorSize, false)) ) )
344                     {
345                         gFailCount++;
346                         goto exit;
347                     }
348 
349                     if( (error = clFinish(gQueue)) )
350                     {
351                         vlog_error( "Failure in clFinish\n" );
352                         gFailCount++;
353                         goto exit;
354                     }
355                     uint64_t currentTime = ReadTime() - startTime;
356                     if( currentTime < bestTime )
357                         bestTime = currentTime;
358                     doubleTime[ vectorSize ] += currentTime;
359                 }
360                 if( bestTime < min_double_time[ vectorSize ] )
361                     min_double_time[ vectorSize ] = bestTime;
362             }
363         }
364     }
365 
366     if( gReportTimes )
367     {
368         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
369             vlog_perf( SubtractTime( time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0, "average us/elem", "roundTrip avg. (vector size: %d)", (g_arrVecSizes[vectorSize]) );
370         for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
371             vlog_perf( SubtractTime( min_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0, "best us/elem", "roundTrip best (vector size: %d)", (g_arrVecSizes[vectorSize])  );
372         if( gTestDouble )
373         {
374             for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
375                 vlog_perf( SubtractTime( doubleTime[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) (count * loopCount), 0, "average us/elem (double)", "roundTrip avg. d (vector size: %d)", (g_arrVecSizes[vectorSize])  );
376             for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
377                 vlog_perf( SubtractTime( min_double_time[ vectorSize ], 0 ) * 1e6 * gDeviceFrequency * gComputeDevices / (double) count, 0, "best us/elem (double)", "roundTrip best d (vector size: %d)", (g_arrVecSizes[vectorSize]) );
378         }
379     }
380 
381 exit:
382     //clean up
383     for( vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest; vectorSize++)
384     {
385         clReleaseKernel( kernels[ vectorSize ] );
386         clReleaseProgram( programs[ vectorSize ] );
387         if( gTestDouble )
388         {
389             clReleaseKernel( doubleKernels[ vectorSize ] );
390             clReleaseProgram( doublePrograms[ vectorSize ] );
391         }
392     }
393 
394     return error;
395 }
396 
397 
398