1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include <algorithm>
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #include <limits.h>
21 #include <sys/types.h>
22 #include <sys/stat.h>
23 #include <vector>
24
25 #include <CL/cl_half.h>
26
27 #include "procs.h"
28 #include "harness/conversions.h"
29 #include "harness/errorHelpers.h"
30 #include "harness/stringHelpers.h"
31 #include "harness/typeWrappers.h"
32
// Outputs debug information for stores
#define DEBUG 0
// Forces stores/loads to be done with offsets = tid
#define LINEAR_OFFSETS 0
// Number of load (and upper bound of store) work-items per test run.
#define NUM_LOADS 512
// Half <-> float conversion helpers; HFF honors the device rounding mode.
#define HFF(num) cl_half_from_float(num, halfRoundingMode)
#define HTF(num) cl_half_to_float(num)

// Scratch buffers used to assemble kernel source at runtime. The pattern
// arrays below reference these by address, so each create_*_code() helper
// must fill them in before the patterns are concatenated.
char pragma_str[128] = { 0 }; // extension pragma (fp16/fp64) or empty
char mem_type[64] = { 0 }; // address-space qualifier of the source pointer
char store_str[128] = { 0 }; // result-store statement (vector or 3-elem form)
char load_str[128] = { 0 }; // local-copy statement used by the __local test

// Rounding mode for float->half conversion; defined in another test file.
extern cl_half_rounding_mode halfRoundingMode;
47
// clang-format off
// Statement templates copied into store_str/load_str. The V3 variants copy a
// 3-vector element-by-element, since 3-element data is laid out/packed
// specially and cannot be stored through a plain vector assignment here.
static const char *store_pattern= "results[ tid ] = tmp;\n";
static const char *store_patternV3 = "results[3*tid] = tmp.s0; results[3*tid+1] = tmp.s1; results[3*tid+2] = tmp.s2;\n";
static const char *load_pattern = "sSharedStorage[ i ] = src[ i ];\n";
static const char *load_patternV3 = "sSharedStorage[3*i] = src[ 3*i]; sSharedStorage[3*i+1] = src[3*i+1]; sSharedStorage[3*i+2] = src[3*i+2];\n";
// Kernel skeleton for the __global/__constant vload tests. Concatenated by
// concat_kernel(), then the %-placeholders are filled by str_sprintf() with:
// element type name, result type name, vector base type, vector width (x2).
static const char *kernel_pattern[] = {
pragma_str,
"#define STYPE %s\n"
"__kernel void test_fn( ", mem_type, " STYPE *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
"{\n"
" int tid = get_global_id( 0 );\n"
" %s%d tmp = vload%d( offsets[ tid ], ( (", mem_type, " STYPE *) src ) + alignmentOffsets[ tid ] );\n"
" ", store_str,
"}\n"
};
63
// Kernel skeleton for the __local vload test: the first work-item of each
// work-group copies the source buffer into local storage, all work-items
// rendezvous at a barrier, then every work-item vloads from the shared copy.
// str_sprintf() placeholders: local element type, source element type,
// result type, copy-loop element count, vector base type, vector width (x2),
// vector base type again.
const char *pattern_local [] = {
pragma_str,
"__kernel void test_fn(__local %s *sSharedStorage, __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
"{\n"
" int tid = get_global_id( 0 );\n"
" int lid = get_local_id( 0 );\n"
"\n"
" if( lid == 0 )\n"
" {\n"
" for( int i = 0; i < %d; i++ ) {\n"
" ", load_str,
" }\n"
" }\n"
// Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
// threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
// updated on all threads at that point
" barrier( CLK_LOCAL_MEM_FENCE );\n"
"\n"
" %s%d tmp = vload%d( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
" ", store_str,
"}\n" };
85
// Kernel skeleton for the __private vload test.
// str_sprintf() placeholders: private element type, private element count,
// source element type, result type, vector base type, vector width (x2),
// vector base type again.
const char *pattern_priv [] = {
pragma_str,
// Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
// for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
"#define PRIV_TYPE %s\n"
"#define PRIV_SIZE %d\n"
"__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
"{\n"
" __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
" int tid = get_global_id( 0 );\n"
"\n"
" for( int i = 0; i < PRIV_SIZE; i++ )\n"
" sPrivateStorage[ i ] = src[ i ];\n"
// Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
// anybody else to sync up
"\n"
" %s%d tmp = vload%d( offsets[ tid ], ( (__private %s *) sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
" ", store_str,
"}\n"};
// clang-format on
106
#pragma mark -------------------- vload harness --------------------------

// Builds the OpenCL C source for one test variant.
// Args: destination source string, buffer size in vector elements, element
// type, input vector width, output vector width.
typedef void (*create_program_fn)(std::string &, size_t, ExplicitType, size_t,
                                  size_t);
// Runs one load/store test for a single (type, vector width) combination;
// returns 0 on success, non-zero on failure.
typedef int (*test_fn)(cl_device_id, cl_context, cl_command_queue, ExplicitType,
                       unsigned int, create_program_fn, size_t);
113
// Generic vload test harness. Builds a kernel with createFn, has each
// work-item vload a vector from a random element offset plus a random
// sub-vector alignment offset, and validates every result against a
// host-side memcpy reference. Returns 0 on success, 1 on a data mismatch;
// API failures return early via the test_error macros.
int test_vload(cl_device_id device, cl_context context, cl_command_queue queue,
               ExplicitType type, unsigned int vecSize,
               create_program_fn createFn, size_t bufferSize)
{
    clProgramWrapper program;
    clKernelWrapper kernel;
    clMemWrapper streams[ 4 ];
    MTdataHolder d(gRandomSeed);
    // DEBUG builds shrink the workload so per-buffer logging stays readable.
    const size_t numLoads = (DEBUG) ? 16 : NUM_LOADS;

    if (DEBUG) bufferSize = (bufferSize < 128) ? bufferSize : 128;

    size_t threads[ 1 ], localThreads[ 1 ];
    clProtectedArray inBuffer( bufferSize );
    cl_uint offsets[ numLoads ], alignmentOffsets[ numLoads ];
    size_t numElements, typeSize, i;
    unsigned int outVectorSize;

    // Enable the extension pragma required by the element type, if any.
    pragma_str[0] = '\0';
    if (type == kDouble)
        std::snprintf(pragma_str, sizeof(pragma_str),
                      "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n");
    else if (type == kHalf)
        std::snprintf(pragma_str, sizeof(pragma_str),
                      "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n");

    typeSize = get_explicit_type_size( type );
    numElements = bufferSize / ( typeSize * vecSize );
    bufferSize = numElements * typeSize * vecSize; // To account for rounding

    if (DEBUG) log_info("Testing: numLoads: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numLoads, (int)typeSize, vecSize, (int)numElements, (int)bufferSize);

    // Create some random input data and random offsets to load from
    generate_random_data( type, numElements * vecSize, d, (void *)inBuffer );
    for( i = 0; i < numLoads; i++ )
    {
        offsets[ i ] = (cl_uint)random_in_range( 0, (int)numElements - 1, d );
        // Only mis-align loads that stay clear of the end of the buffer, so
        // the unaligned read cannot run past the allocation.
        if( offsets[ i ] < numElements - 2 )
            alignmentOffsets[ i ] = (cl_uint)random_in_range( 0, (int)vecSize - 1, d );
        else
            alignmentOffsets[ i ] = 0;
        if (LINEAR_OFFSETS) offsets[i] = (cl_uint)i;
    }
    if (LINEAR_OFFSETS) log_info("Offsets set to thread IDs to simplify output.\n");

    // 32-bit fixup
    outVectorSize = vecSize;

    // Declare output buffers now
    std::vector<char> outBuffer(numLoads * typeSize * outVectorSize);
    std::vector<char> referenceBuffer(numLoads * typeSize * vecSize);

    // Create the program
    std::string programSrc;
    createFn( programSrc, numElements, type, vecSize, outVectorSize);

    // Create our kernel
    const char *ptr = programSrc.c_str();
    cl_int error = create_single_kernel_helper(context, &program, &kernel, 1,
                                               &ptr, "test_fn");
    test_error( error, "Unable to create testing kernel" );
    if (DEBUG) log_info("Kernel: \n%s\n", programSrc.c_str());

    // Get the number of args to differentiate the kernels with local storage. (They have 5)
    cl_uint numArgs;
    error = clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(numArgs), &numArgs, NULL);
    test_error( error, "clGetKernelInfo failed");

    // Set up parameters
    streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, bufferSize, (void *)inBuffer, &error );
    test_error( error, "Unable to create kernel stream" );
    streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(offsets[0]), offsets, &error );
    test_error( error, "Unable to create kernel stream" );
    streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(alignmentOffsets[0]), alignmentOffsets, &error );
    test_error( error, "Unable to create kernel stream" );
    streams[3] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                                numLoads * typeSize * outVectorSize,
                                (void *)outBuffer.data(), &error);
    test_error( error, "Unable to create kernel stream" );

    // Set parameters and run
    if (numArgs == 5) {
        // We need to set the size of the local storage
        error = clSetKernelArg(kernel, 0, bufferSize, NULL);
        test_error( error, "clSetKernelArg for buffer failed");
        // Buffer args shift up by one to make room for the local pointer.
        for( i = 0; i < 4; i++ )
        {
            error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] );
            test_error( error, "Unable to set kernel argument" );
        }
    } else {
        // No local storage
        for( i = 0; i < 4; i++ )
        {
            error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] );
            test_error( error, "Unable to set kernel argument" );
        }
    }

    // One work-item per load.
    threads[ 0 ] = numLoads;
    error = get_max_common_work_group_size( context, kernel, threads[ 0 ], &localThreads[ 0 ] );
    test_error( error, "Unable to get local thread size" );

    error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
    test_error( error, "Unable to exec kernel" );

    // Get the results (blocking read, so no explicit finish is needed)
    error = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0,
                                numLoads * typeSize * outVectorSize
                                    * sizeof(cl_char),
                                (void *)outBuffer.data(), 0, NULL, NULL);
    test_error( error, "Unable to read results" );

    // Create the reference results: a straight host-side copy of the bytes
    // each vload should have fetched.
    referenceBuffer.assign(numLoads * typeSize * vecSize, 0);
    for( i = 0; i < numLoads; i++ )
    {
        memcpy(&referenceBuffer[i * typeSize * vecSize],
               ((char *)(void *)inBuffer)
                   + ((offsets[i] * vecSize) + alignmentOffsets[i]) * typeSize,
               typeSize * vecSize);
    }

    // Validate the results now
    char *expected = referenceBuffer.data();
    char *actual = outBuffer.data();
    char *in = (char *)(void *)inBuffer;

    if (DEBUG) {
        // Dump every element of the buffers side by side before validating.
        log_info("Memory contents:\n");
        char inString[1024];
        char expectedString[1024], actualString[1024];
        for (i=0; i<numElements; i++) {
            if (i < numLoads) {
                log_info("buffer %3d: input: %s expected: %s got: %s (load offset %3d, alignment offset %3d)", (int)i, GetDataVectorString( &(in[i*typeSize*vecSize]), typeSize, vecSize, inString ),
                         GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
                         GetDataVectorString( &(actual[i*typeSize*outVectorSize]), typeSize, vecSize, actualString ),
                         offsets[i], alignmentOffsets[i]);
                if (memcmp(&(expected[i*typeSize*vecSize]), &(actual[i*typeSize*outVectorSize]), typeSize * vecSize) != 0)
                    log_error(" << ERROR\n");
                else
                    log_info("\n");
            } else {
                log_info("buffer %3d: input: %s expected: %s got: %s\n", (int)i, GetDataVectorString( &(in[i*typeSize*vecSize]), typeSize, vecSize, inString ),
                         GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
                         GetDataVectorString( &(actual[i*typeSize*outVectorSize]), typeSize, vecSize, actualString ));
            }
        }
    }

    // Compare one vector at a time; note actual advances by outVectorSize
    // while expected advances by vecSize (they differ only if a fixup above
    // ever sets them apart).
    for( i = 0; i < numLoads; i++ )
    {
        if( memcmp( expected, actual, typeSize * vecSize ) != 0 )
        {
            char expectedString[ 1024 ], actualString[ 1024 ];
            log_error( "ERROR: Data sample %d for vload of %s%d did not validate (expected {%s}, got {%s}, loaded from offset %d)\n",
                       (int)i, get_explicit_type_name( type ), vecSize, GetDataVectorString( expected, typeSize, vecSize, expectedString ),
                       GetDataVectorString( actual, typeSize, vecSize, actualString ), (int)offsets[ i ] );
            return 1;
        }
        expected += typeSize * vecSize;
        actual += typeSize * outVectorSize;
    }
    return 0;
}
279
280 template <test_fn test_func_ptr>
test_vset(cl_device_id device,cl_context context,cl_command_queue queue,create_program_fn createFn,size_t bufferSize)281 int test_vset(cl_device_id device, cl_context context, cl_command_queue queue,
282 create_program_fn createFn, size_t bufferSize)
283 {
284 std::vector<ExplicitType> vecType = { kChar, kUChar, kShort, kUShort,
285 kInt, kUInt, kLong, kULong,
286 kFloat, kHalf, kDouble };
287 unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 };
288 const char *size_names[] = { "2", "3", "4", "8", "16"};
289 int error = 0;
290
291 log_info("Testing with buffer size of %d.\n", (int)bufferSize);
292
293 bool hasDouble = is_extension_available(device, "cl_khr_fp64");
294 bool hasHalf = is_extension_available(device, "cl_khr_fp16");
295
296 for (unsigned typeIdx = 0; typeIdx < vecType.size(); typeIdx++)
297 {
298 if (vecType[typeIdx] == kDouble && !hasDouble)
299 continue;
300 else if (vecType[typeIdx] == kHalf && !hasHalf)
301 continue;
302 else if ((vecType[typeIdx] == kLong || vecType[typeIdx] == kULong)
303 && !gHasLong)
304 continue;
305
306 for (unsigned sizeIdx = 0; vecSizes[sizeIdx] != 0; sizeIdx++)
307 {
308 log_info("Testing %s%s...\n", get_explicit_type_name(vecType[typeIdx]), size_names[sizeIdx]);
309
310 int error_this_type =
311 test_func_ptr(device, context, queue, vecType[typeIdx],
312 vecSizes[sizeIdx], createFn, bufferSize);
313 if (error_this_type) {
314 error += error_this_type;
315 log_error("Failure; skipping further sizes for this type.");
316 break;
317 }
318 }
319 }
320 return error;
321 }
322
323 #pragma mark -------------------- vload test cases --------------------------
324
create_global_load_code(std::string & destBuffer,size_t inBufferSize,ExplicitType type,size_t inVectorSize,size_t outVectorSize)325 void create_global_load_code(std::string &destBuffer, size_t inBufferSize,
326 ExplicitType type, size_t inVectorSize,
327 size_t outVectorSize)
328 {
329 std::snprintf(mem_type, sizeof(mem_type), "__global");
330 std::snprintf(store_str, sizeof(store_str), store_patternV3);
331 const char *typeName = get_explicit_type_name(type);
332 std::string outTypeName = typeName;
333 if (inVectorSize != 3)
334 {
335 outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
336 std::snprintf(store_str, sizeof(store_str), store_pattern);
337 }
338
339 std::string kernel_src = concat_kernel(
340 kernel_pattern, sizeof(kernel_pattern) / sizeof(kernel_pattern[0]));
341 destBuffer = str_sprintf(kernel_src, typeName, outTypeName.c_str(),
342 typeName, (int)inVectorSize, (int)inVectorSize);
343 }
344
test_vload_global(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)345 int test_vload_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
346 {
347 return test_vset<test_vload>(device, context, queue,
348 create_global_load_code, 10240);
349 }
350
create_local_load_code(std::string & destBuffer,size_t inBufferSize,ExplicitType type,size_t inVectorSize,size_t outVectorSize)351 void create_local_load_code(std::string &destBuffer, size_t inBufferSize,
352 ExplicitType type, size_t inVectorSize,
353 size_t outVectorSize)
354 {
355 std::snprintf(store_str, sizeof(store_str), store_patternV3);
356 std::snprintf(load_str, sizeof(load_str), load_patternV3);
357 const char *typeName = get_explicit_type_name(type);
358 std::string outTypeName = typeName;
359 std::string inTypeName = typeName;
360 if (inVectorSize != 3)
361 {
362 outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
363 inTypeName = str_sprintf("%s%d", typeName, (int)inVectorSize);
364 std::snprintf(store_str, sizeof(store_str), store_pattern);
365 std::snprintf(load_str, sizeof(load_str), load_pattern);
366 }
367
368 std::string kernel_src = concat_kernel(
369 pattern_local, sizeof(pattern_local) / sizeof(pattern_local[0]));
370 destBuffer = str_sprintf(kernel_src, inTypeName.c_str(), inTypeName.c_str(),
371 outTypeName.c_str(), (int)inBufferSize, typeName,
372 (int)inVectorSize, (int)inVectorSize, typeName);
373 }
374
test_vload_local(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)375 int test_vload_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
376 {
377 // Determine the max size of a local buffer that we can test against
378 cl_ulong localSize;
379 int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
380 test_error( error, "Unable to get max size of local memory buffer" );
381 if (localSize > 10240) localSize = 10240;
382 if (localSize > 4096)
383 localSize -= 2048;
384 else
385 localSize /= 2;
386
387 return test_vset<test_vload>(device, context, queue, create_local_load_code,
388 (size_t)localSize);
389 }
390
create_constant_load_code(std::string & destBuffer,size_t inBufferSize,ExplicitType type,size_t inVectorSize,size_t outVectorSize)391 void create_constant_load_code(std::string &destBuffer, size_t inBufferSize,
392 ExplicitType type, size_t inVectorSize,
393 size_t outVectorSize)
394 {
395 std::snprintf(mem_type, sizeof(mem_type), "__constant");
396 std::snprintf(store_str, sizeof(store_str), store_patternV3);
397 const char *typeName = get_explicit_type_name(type);
398 std::string outTypeName = typeName;
399 if (inVectorSize != 3)
400 {
401 outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
402 std::snprintf(store_str, sizeof(store_str), store_pattern);
403 }
404
405 std::string kernel_src = concat_kernel(
406 kernel_pattern, sizeof(kernel_pattern) / sizeof(kernel_pattern[0]));
407 destBuffer = str_sprintf(kernel_src, typeName, outTypeName.c_str(),
408 typeName, (int)inVectorSize, (int)inVectorSize);
409 }
410
test_vload_constant(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)411 int test_vload_constant(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
412 {
413 // Determine the max size of a local buffer that we can test against
414 cl_ulong maxSize;
415 int error = clGetDeviceInfo( device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, NULL );
416 test_error( error, "Unable to get max size of constant memory buffer" );
417 if (maxSize > 10240) maxSize = 10240;
418 if (maxSize > 4096)
419 maxSize -= 2048;
420 else
421 maxSize /= 2;
422
423 return test_vset<test_vload>(device, context, queue,
424 create_constant_load_code, (size_t)maxSize);
425 }
426
create_private_load_code(std::string & destBuffer,size_t inBufferSize,ExplicitType type,size_t inVectorSize,size_t outVectorSize)427 void create_private_load_code(std::string &destBuffer, size_t inBufferSize,
428 ExplicitType type, size_t inVectorSize,
429 size_t outVectorSize)
430 {
431 std::snprintf(store_str, sizeof(store_str), store_patternV3);
432 const char *typeName = get_explicit_type_name(type);
433 std::string outTypeName = typeName;
434 std::string inTypeName = typeName;
435 int bufSize = (int)inBufferSize * 3;
436 if (inVectorSize != 3)
437 {
438 outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
439 inTypeName = str_sprintf("%s%d", typeName, (int)inVectorSize);
440 bufSize = (int)inBufferSize;
441 std::snprintf(store_str, sizeof(store_str), store_pattern);
442 }
443
444 std::string kernel_src = concat_kernel(
445 pattern_priv, sizeof(pattern_priv) / sizeof(pattern_priv[0]));
446 destBuffer = str_sprintf(kernel_src, inTypeName.c_str(), bufSize,
447 inTypeName.c_str(), outTypeName.c_str(), typeName,
448 (int)inVectorSize, (int)inVectorSize, typeName);
449 }
450
test_vload_private(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)451 int test_vload_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
452 {
453 // We have no idea how much actual private storage is available, so just pick a reasonable value,
454 // which is that we can fit at least two 16-element long, which is 2*8 bytes * 16 = 256 bytes
455 return test_vset<test_vload>(device, context, queue,
456 create_private_load_code, 256);
457 }
458
459 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
460 #pragma mark -------------------- vstore harness --------------------------
461
// Generic vstore test harness. Builds a kernel with createFn, has each
// work-item vstore a random vector to a unique element offset, then repeats
// the run once per sub-vector address offset (0 .. vecSize-1) so stores
// never overlap. Each pass is validated against a host-built reference,
// including zeroed "canary" regions that no store should touch. Returns 0
// on success, 1 on a data mismatch; API failures return early via
// test_error macros.
int test_vstore(cl_device_id device, cl_context context, cl_command_queue queue,
                ExplicitType type, unsigned int vecSize,
                create_program_fn createFn, size_t bufferSize)
{
    clProgramWrapper program;
    clKernelWrapper kernel;
    clMemWrapper streams[ 3 ];
    MTdataHolder d(gRandomSeed);

    size_t threads[ 1 ], localThreads[ 1 ];
    // DEBUG builds shrink the workload so per-buffer logging stays readable.
    size_t numElements, typeSize, numStores = (DEBUG) ? 16 : NUM_LOADS;

    // Enable the extension pragma required by the element type, if any.
    pragma_str[0] = '\0';
    if (type == kDouble)
        std::snprintf(pragma_str, sizeof(pragma_str),
                      "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n");
    else if (type == kHalf)
        std::snprintf(pragma_str, sizeof(pragma_str),
                      "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n");

    if (DEBUG)
        bufferSize = (bufferSize < 128) ? bufferSize : 128;

    typeSize = get_explicit_type_size( type );
    numElements = bufferSize / ( typeSize * vecSize );
    bufferSize = numElements * typeSize * vecSize; // To account for rounding
    if( numStores > numElements * 2 / 3 )
    {
        // Note: unlike load, we have to restrict the # of stores here, since all offsets must be unique for our test
        // (Plus, we leave some room for extra values to make sure didn't get written)
        numStores = numElements * 2 / 3;
        if( numStores < 1 )
            numStores = 1;
    }
    if (DEBUG)
        log_info("Testing: numStores: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numStores, (int)typeSize, vecSize, (int)numElements, (int)bufferSize);

    std::vector<cl_uint> offsets(numStores);
    std::vector<char> inBuffer(numStores * typeSize * vecSize);

    clProtectedArray outBuffer( numElements * typeSize * vecSize );
    std::vector<char> referenceBuffer(numElements * typeSize * vecSize);

    // Create some random input data and random offsets to load from
    generate_random_data(type, numStores * vecSize, d, (void *)inBuffer.data());

    // Note: make sure no two offsets are the same, otherwise the output would depend on
    // the order that threads ran in, and that would be next to impossible to verify
    std::vector<char> flags(numElements);
    flags.assign(flags.size(), 0);

    for (size_t i = 0; i < numStores; i++)
    {
        // Rejection-sample until an unused offset is found.
        do
        {
            offsets[ i ] = (cl_uint)random_in_range( 0, (int)numElements - 2, d ); // Note: keep it one vec below the end for offset testing
        } while( flags[ offsets[ i ] ] != 0 );
        flags[ offsets[ i ] ] = -1;
        if (LINEAR_OFFSETS)
            offsets[i] = (int)i;
    }
    if (LINEAR_OFFSETS)
        log_info("Offsets set to thread IDs to simplify output.\n");

    std::string programSrc;
    createFn(programSrc, numElements, type, vecSize, vecSize);

    // Create our kernel
    const char *ptr = programSrc.c_str();
    cl_int error = create_single_kernel_helper(context, &program, &kernel, 1,
                                               &ptr, "test_fn");
    test_error( error, "Unable to create testing kernel" );
    if (DEBUG) log_info("Kernel: \n%s\n", programSrc.c_str());

    // Get the number of args to differentiate the kernels with local storage. (They have 5)
    cl_uint numArgs;
    error = clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(numArgs), &numArgs, NULL);
    test_error( error, "clGetKernelInfo failed");

    // Set up parameters
    streams[0] =
        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                       numStores * typeSize * vecSize * sizeof(cl_char),
                       (void *)inBuffer.data(), &error);
    test_error( error, "Unable to create kernel stream" );
    streams[1] =
        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                       numStores * sizeof(cl_uint), offsets.data(), &error);
    test_error( error, "Unable to create kernel stream" );
    streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numElements * typeSize * vecSize, (void *)outBuffer, &error );
    test_error( error, "Unable to create kernel stream" );

    // Set parameters and run
    if (numArgs == 5)
    {
        // We need to set the size of the local storage
        error = clSetKernelArg(kernel, 0, bufferSize, NULL);
        test_error( error, "clSetKernelArg for buffer failed");
        // Buffer args shift up by one to make room for the local pointer.
        for (size_t i = 0; i < 3; i++)
        {
            error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] );
            test_error( error, "Unable to set kernel argument" );
        }
    }
    else
    {
        // No local storage
        for (size_t i = 0; i < 3; i++)
        {
            error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] );
            if (error) log_info("%s\n", programSrc.c_str());
            test_error( error, "Unable to set kernel argument" );
        }
    }

    // One work-item per store.
    threads[ 0 ] = numStores;
    error = get_max_common_work_group_size( context, kernel, threads[ 0 ], &localThreads[ 0 ] );
    test_error( error, "Unable to get local thread size" );

    // Run in a loop, changing the address offset from 0 to ( vecSize - 1 ) each time, since
    // otherwise stores might overlap each other, and it'd be a nightmare to test!
    for( cl_uint addressOffset = 0; addressOffset < vecSize; addressOffset++ )
    {
        if (DEBUG)
            log_info("\tstore addressOffset is %d, executing with threads %d\n", addressOffset, (int)threads[0]);

        // Clear the results first
        memset( outBuffer, 0, numElements * typeSize * vecSize );
        error = clEnqueueWriteBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL );
        test_error( error, "Unable to erase result stream" );

        // Set up the new offset and run; the alignmentOffset arg is the last
        // one (index 3, or 4 when the local-storage arg shifts things up).
        if (numArgs == 5)
            error = clSetKernelArg( kernel, 3+1, sizeof( cl_uint ), &addressOffset );
        else
            error = clSetKernelArg( kernel, 3, sizeof( cl_uint ), &addressOffset );
        test_error( error, "Unable to set address offset argument" );

        error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL );
        test_error( error, "Unable to exec kernel" );

        // Get the results (blocking read)
        error = clEnqueueReadBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL );
        test_error( error, "Unable to read results" );

        // Create the reference results: zero everywhere except where each
        // store was supposed to land.
        referenceBuffer.assign(referenceBuffer.size(), 0);
        for (size_t i = 0; i < numStores; i++)
        {
            memcpy(&referenceBuffer[((offsets[i] * vecSize) + addressOffset)
                                    * typeSize],
                   &inBuffer[i * typeSize * vecSize], typeSize * vecSize);
        }

        // Validate the results now
        char *expected = referenceBuffer.data();
        char *actual = (char *)(void *)outBuffer;

        if (DEBUG)
        {
            // Dump every element of the buffers side by side before validating.
            log_info("Memory contents:\n");
            char inString[1024];
            char expectedString[1024], actualString[1024];
            for (size_t i = 0; i < numElements; i++)
            {
                if (i < numStores)
                {
                    log_info("buffer %3d: input: %s expected: %s got: %s (store offset %3d)", (int)i, GetDataVectorString( &(inBuffer[i*typeSize*vecSize]), typeSize, vecSize, inString ),
                             GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
                             GetDataVectorString( &(actual[i*typeSize*vecSize]), typeSize, vecSize, actualString ),
                             offsets[i]);
                    if (memcmp(&(expected[i*typeSize*vecSize]), &(actual[i*typeSize*vecSize]), typeSize * vecSize) != 0)
                        log_error(" << ERROR\n");
                    else
                        log_info("\n");
                }
                else
                {
                    log_info("buffer %3d: input: %s expected: %s got: %s\n", (int)i, GetDataVectorString( &(inBuffer[i*typeSize*vecSize]), typeSize, vecSize, inString ),
                             GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
                             GetDataVectorString( &(actual[i*typeSize*vecSize]), typeSize, vecSize, actualString ));
                }
            }
        }

        // Compare the ENTIRE buffer, so untouched canary regions are checked
        // to still be zero.
        for (size_t i = 0; i < numElements; i++)
        {
            if( memcmp( expected, actual, typeSize * vecSize ) != 0 )
            {
                char expectedString[ 1024 ], actualString[ 1024 ];
                log_error( "ERROR: Data sample %d for vstore of %s%d did not validate (expected {%s}, got {%s}",
                           (int)i, get_explicit_type_name( type ), vecSize, GetDataVectorString( expected, typeSize, vecSize, expectedString ),
                           GetDataVectorString( actual, typeSize, vecSize, actualString ) );
                // Identify which store (if any) targeted this element, to aid
                // debugging; otherwise it was a corrupted canary.
                size_t j;
                for( j = 0; j < numStores; j++ )
                {
                    if( offsets[ j ] == (cl_uint)i )
                    {
                        log_error( ", stored from store #%d (of %d, offset = %d) with address offset of %d", (int)j, (int)numStores, offsets[j], (int)addressOffset );
                        break;
                    }
                }
                if( j == numStores )
                    log_error( ", supposed to be canary value" );
                log_error( ")\n" );
                return 1;
            }
            expected += typeSize * vecSize;
            actual += typeSize * vecSize;
        }
    }
    return 0;
}
675
676 #pragma mark -------------------- vstore test cases --------------------------
677
// Builds the kernel source for the __global vstore test into destBuffer.
// The 3-vector variant reads source data with vload3 (mixing in direct
// 3-vector reads on every 4th work-item) since 3-element data has no plain
// pointer type to index through. inBufferSize is unused.
void create_global_store_code(std::string &destBuffer, size_t inBufferSize,
                              ExplicitType type, size_t inVectorSize,
                              size_t /*unused*/)
{
    // clang-format off
    const char *pattern [] = {
        pragma_str,
        "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
        "{\n"
        " int tid = get_global_id( 0 );\n"
        " vstore%d( srcValues[ tid ], offsets[ tid ], destBuffer + alignmentOffset );\n"
        "}\n" };

    const char *patternV3 [] = {
        pragma_str,
        "__kernel void test_fn( __global %s3 *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
        "{\n"
        " int tid = get_global_id( 0 );\n"
        " if((tid&3) == 0) { // if \"tid\" is a multiple of 4 \n"
        " vstore3( srcValues[ 3*(tid>>2) ], offsets[ tid ], destBuffer + alignmentOffset );\n"
        " } else {\n"
        " vstore3( vload3(tid, (__global %s *)srcValues), offsets[ tid ], destBuffer + alignmentOffset );\n"
        " }\n"
        "}\n" };
    // clang-format on

    const char *typeName = get_explicit_type_name(type);
    if(inVectorSize == 3) {
        std::string kernel_src =
            concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
        destBuffer = str_sprintf(kernel_src, typeName, typeName, typeName);
    }
    else
    {
        std::string kernel_src =
            concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
        destBuffer = str_sprintf(kernel_src, typeName, (int)inVectorSize,
                                 typeName, (int)inVectorSize);
    }
}
718
test_vstore_global(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)719 int test_vstore_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
720 {
721 return test_vset<test_vstore>(device, context, queue,
722 create_global_store_code, 10240);
723 }
724
// Builds the OpenCL C program source for the __local-address-space vstore
// test. The generated kernel zeroes its slice of local memory, vstoreN's a
// per-thread source value into local storage at offsets[tid] (plus a scalar
// alignmentOffset), then after a barrier copies the stored elements back to
// a global buffer for host-side verification.
//
// destBuffer   - receives the generated program source
// inBufferSize - not used by this generator (buffer sizing is handled by the
//                caller); present for signature parity with the other
//                create_*_store_code generators
// type         - scalar element type under test
// inVectorSize - vector width under test; width 3 gets a dedicated kernel
//                because vec3 requires vload3/vstore3 and 3-element indexing
void create_local_store_code(std::string &destBuffer, size_t inBufferSize,
                             ExplicitType type, size_t inVectorSize,
                             size_t /*unused*/)
{
    // clang-format off
    // Generic kernel pattern for scalar and vector widths 2/4/8/16.
    // Format arguments (in order): LOC_TYPE, LOC_VTYPE base type,
    // vector width (VTYPE suffix), vector width (vstore%d).
    const char *pattern[] = {
        pragma_str,
        "#define LOC_TYPE %s\n"
        "#define LOC_VTYPE %s%d\n"
        "__kernel void test_fn(__local LOC_VTYPE *sSharedStorage, __global LOC_VTYPE *srcValues, __global uint *offsets, __global LOC_VTYPE *destBuffer, uint alignmentOffset )\n"
        "{\n"
        " int tid = get_global_id( 0 );\n"
        // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
        " sSharedStorage[ offsets[tid] ] = (LOC_VTYPE)(LOC_TYPE)0;\n"
        " sSharedStorage[ offsets[tid] +1 ] = sSharedStorage[ offsets[tid] ];\n"
        " barrier( CLK_LOCAL_MEM_FENCE );\n"
        "\n"
        " vstore%d( srcValues[ tid ], offsets[ tid ], ( (__local LOC_TYPE *)sSharedStorage ) + alignmentOffset );\n"
        "\n"
        // Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output
        // buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy
        " barrier( CLK_LOCAL_MEM_FENCE );\n"
        "\n"
        // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
        // otherwise, local threads would be overwriting results from other local threads
        " int i;\n"
        " __local LOC_TYPE *sp = (__local LOC_TYPE*) (sSharedStorage + offsets[tid]) + alignmentOffset;\n"
        " __global LOC_TYPE *dp = (__global LOC_TYPE*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
        " for( i = 0; (size_t)i < sizeof( sSharedStorage[0]) / sizeof( *sp ); i++ ) \n"
        " dp[i] = sp[i];\n"
        "}\n" };

    // Width-3 variant: vec3 has no direct array-element semantics, so the
    // kernel uses vload3/vstore3 and addresses local storage in groups of
    // three scalars. Format argument: LOC_TYPE.
    const char *patternV3 [] = {
        pragma_str,
        "#define LOC_TYPE %s\n"
        "__kernel void test_fn(__local LOC_TYPE *sSharedStorage, __global LOC_TYPE *srcValues, __global uint *offsets, __global LOC_TYPE *destBuffer, uint alignmentOffset )\n"
        "{\n"
        " int tid = get_global_id( 0 );\n"
        // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
        // Six scalars are cleared per thread: the 3 elements plus slack for
        // the alignmentOffset shift into the following group.
        " sSharedStorage[ 3*offsets[tid] ] = (LOC_TYPE)0;\n"
        " sSharedStorage[ 3*offsets[tid] +1 ] = \n"
        " sSharedStorage[ 3*offsets[tid] ];\n"
        " sSharedStorage[ 3*offsets[tid] +2 ] = \n"
        " sSharedStorage[ 3*offsets[tid]];\n"
        " sSharedStorage[ 3*offsets[tid] +3 ] = \n"
        " sSharedStorage[ 3*offsets[tid]];\n"
        " sSharedStorage[ 3*offsets[tid] +4 ] = \n"
        " sSharedStorage[ 3*offsets[tid] ];\n"
        " sSharedStorage[ 3*offsets[tid] +5 ] = \n"
        " sSharedStorage[ 3*offsets[tid]];\n"
        " barrier( CLK_LOCAL_MEM_FENCE );\n"
        "\n"
        " vstore3( vload3(tid,srcValues), offsets[ tid ], sSharedStorage + alignmentOffset );\n"
        "\n"
        // Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output
        // buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy
        " barrier( CLK_LOCAL_MEM_FENCE );\n"
        "\n"
        // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
        // otherwise, local threads would be overwriting results from other local threads
        " int i;\n"
        " __local LOC_TYPE *sp = (sSharedStorage + 3*offsets[tid]) + alignmentOffset;\n"
        " __global LOC_TYPE *dp = (destBuffer + 3*offsets[tid]) + alignmentOffset;\n"
        " for( i = 0; i < 3; i++ ) \n"
        " dp[i] = sp[i];\n"
        "}\n" };
    // clang-format on

    const char *typeName = get_explicit_type_name(type);
    if(inVectorSize == 3) {
        // vec3 pattern takes a single %s (the scalar type name).
        std::string kernel_src =
            concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
        destBuffer = str_sprintf(kernel_src, typeName);
    }
    else
    {
        // Generic pattern: type name twice, then the width twice
        // (VTYPE suffix and the vstore%d call).
        std::string kernel_src =
            concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
        destBuffer = str_sprintf(kernel_src, typeName, typeName,
                                 (int)inVectorSize, (int)inVectorSize);
    }
}
807
test_vstore_local(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)808 int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
809 {
810 // Determine the max size of a local buffer that we can test against
811 cl_ulong localSize;
812 int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
813 test_error( error, "Unable to get max size of local memory buffer" );
814 if (localSize > 10240) localSize = 10240;
815 if (localSize > 4096)
816 localSize -= 2048;
817 else
818 localSize /= 2;
819 return test_vset<test_vstore>(device, context, queue,
820 create_local_store_code, (size_t)localSize);
821 }
822
// Builds the OpenCL C program source for the __private-address-space vstore
// test. Because private memory is per-thread (not shared like __local), each
// work-item keeps its own private array, vstoreN's into it at a per-thread
// offset (plus a scalar alignmentOffset), and copies the stored elements out
// to a global buffer for host-side verification.
//
// destBuffer   - receives the generated program source
// inBufferSize - element count used to size the kernel's private array
// type         - scalar element type under test
// inVectorSize - vector width under test; width 3 gets a dedicated kernel
//                because vec3 requires vload3/vstore3 and 3-element indexing
void create_private_store_code(std::string &destBuffer, size_t inBufferSize,
                               ExplicitType type, size_t inVectorSize,
                               size_t /*unused*/)
{
    // clang-format off
    // Generic kernel pattern for scalar and vector widths 2/4/8/16.
    // Format arguments (in order): PRIV_TYPE, PRIV_VTYPE base type,
    // vector width, private array size, vector width (vstore%d).
    const char *pattern [] = {
        pragma_str,
        "#define PRIV_TYPE %s\n"
        "#define PRIV_VTYPE %s%d\n"
        // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
        // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
        "\n"
        "__kernel void test_fn( __global PRIV_VTYPE *srcValues, __global uint *offsets, __global PRIV_VTYPE *destBuffer, uint alignmentOffset )\n"
        "{\n"
        " __private PRIV_VTYPE sPrivateStorage[ %d ];\n"
        " int tid = get_global_id( 0 );\n"
        // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
        " sPrivateStorage[tid] = (PRIV_VTYPE)(PRIV_TYPE)0;\n"
        "\n"
        " vstore%d( srcValues[ tid ], offsets[ tid ], ( (__private PRIV_TYPE *)sPrivateStorage ) + alignmentOffset );\n"
        "\n"
        // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
        // otherwise, local threads would be overwriting results from other local threads
        " uint i;\n"
        " __private PRIV_TYPE *sp = (__private PRIV_TYPE*) (sPrivateStorage + offsets[tid]) + alignmentOffset;\n"
        " __global PRIV_TYPE *dp = (__global PRIV_TYPE*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
        " for( i = 0; i < sizeof( sPrivateStorage[0]) / sizeof( *sp ); i++ ) \n"
        " dp[i] = sp[i];\n"
        "}\n"};

    // Width-3 variant: uses vload3/vstore3 and 3-scalar addressing.
    // Format arguments (in order): PRIV_TYPE, PRIV_VTYPE base type,
    // private array size.
    const char *patternV3 [] = {
        pragma_str,
        "#define PRIV_TYPE %s\n"
        "#define PRIV_VTYPE %s3\n"
        // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
        // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
        "\n"
        "__kernel void test_fn( __global PRIV_TYPE *srcValues, __global uint *offsets, __global PRIV_VTYPE *destBuffer, uint alignmentOffset )\n"
        "{\n"
        " __private PRIV_VTYPE sPrivateStorage[ %d ];\n" // keep this %d
        " int tid = get_global_id( 0 );\n"
        // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
        " sPrivateStorage[tid] = (PRIV_VTYPE)(PRIV_TYPE)0;\n"
        "\n"
        " vstore3( vload3(tid,srcValues), offsets[ tid ], ( (__private PRIV_TYPE *)sPrivateStorage ) + alignmentOffset );\n"
        " uint i;\n"
        " __private PRIV_TYPE *sp = ((__private PRIV_TYPE*) sPrivateStorage) + 3*offsets[tid] + alignmentOffset;\n"
        " __global PRIV_TYPE *dp = ((__global PRIV_TYPE*) destBuffer) + 3*offsets[tid] + alignmentOffset;\n"
        " for( i = 0; i < 3; i++ ) \n"
        " dp[i] = sp[i];\n"
        "}\n"};
    // clang-format on

    const char *typeName = get_explicit_type_name(type);
    if(inVectorSize == 3) {
        // vec3 pattern: type name twice, then the private array size.
        std::string kernel_src =
            concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
        destBuffer =
            str_sprintf(kernel_src, typeName, typeName, (int)inBufferSize);
    }
    else
    {
        // Generic pattern: type name twice, width, array size, width again
        // (the vstore%d call).
        std::string kernel_src =
            concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
        destBuffer =
            str_sprintf(kernel_src, typeName, typeName, (int)inVectorSize,
                        (int)inBufferSize, (int)inVectorSize);
    }
}
892
test_vstore_private(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)893 int test_vstore_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
894 {
895 // We have no idea how much actual private storage is available, so just pick a reasonable value,
896 // which is that we can fit at least two 16-element long, which is 2*8 bytes * 16 = 256 bytes
897 return test_vset<test_vstore>(device, context, queue,
898 create_private_store_code, 256);
899 }
900
901
902
903