1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "testBase.h"
17 #include "harness/conversions.h"
18 #ifndef _WIN32
19 #include <unistd.h>
20 #endif
21
22 #include <cinttypes>
23
24 #define INT_TEST_VALUE 402258822
25 #define LONG_TEST_VALUE 515154531254381446LL
26
27 // clang-format off
// Kernel scaffolding for the GLOBAL-memory atomic tests.
// Element [0] opens the kernel (both %s slots receive the type under
// test) and sets up "tid"; the caller-supplied test core is spliced in
// between; element [1] closes the kernel.
const char *atomic_global_pattern[] = {
    "__kernel void test_atomic_fn(volatile __global %s *destMemory, __global %s *oldValues)\n"
    "{\n"
    " int tid = get_global_id(0);\n"
    "\n"
    ,
    "\n"
    "}\n" };
36
// Kernel scaffolding for the LOCAL-memory atomic tests.
// Element [0] copies the seed values from global "finalDest" into the
// __local destination and syncs before the test core runs; element [1]
// syncs again and copies the results back out so the host can read them.
// All three %s slots receive the type under test.
const char *atomic_local_pattern[] = {
    "__kernel void test_atomic_fn(__global %s *finalDest, __global %s *oldValues, volatile __local %s *destMemory, int numDestItems )\n"
    "{\n"
    " int tid = get_global_id(0);\n"
    " int dstItemIdx;\n"
    "\n"
    " // Everybody does the following line(s), but it all has the same result. We still need to ensure we sync before the atomic op, though\n"
    " for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
    " destMemory[ dstItemIdx ] = finalDest[ dstItemIdx ];\n"
    " barrier( CLK_LOCAL_MEM_FENCE );\n"
    "\n"
    ,
    " barrier( CLK_LOCAL_MEM_FENCE );\n"
    " // Finally, write out the last value. Again, we're synced, so everyone will be writing the same value\n"
    " for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
    " finalDest[ dstItemIdx ] = destMemory[ dstItemIdx ];\n"
    "}\n" };
54 // clang-format on
55
56
57 #define TEST_COUNT 128 * 1024
58
59
// Per-test callback bundle consumed by test_atomic_function().
// Only the callbacks relevant to the data type under test need to be
// non-NULL; a NULL Expected*Fn means no final-value check is done and a
// NULL Verify*Fn means no per-thread "old value" check is done.
struct TestFns
{
    // Seed written to every destination item before the kernel runs.
    cl_int mIntStartValue;
    cl_long mLongStartValue;

    // Number of destination items to allocate; NULL means 1.
    size_t (*NumResultsFn)(size_t threadSize, ExplicitType dataType);

    // Integer versions
    cl_int (*ExpectedValueIntFn)(size_t size, cl_int *startRefValues,
                                 size_t whichDestValue);
    void (*GenerateRefsIntFn)(size_t size, cl_int *startRefValues, MTdata d);
    bool (*VerifyRefsIntFn)(size_t size, cl_int *refValues, cl_int finalValue);

    // Long versions
    cl_long (*ExpectedValueLongFn)(size_t size, cl_long *startRefValues,
                                   size_t whichDestValue);
    void (*GenerateRefsLongFn)(size_t size, cl_long *startRefValues, MTdata d);
    bool (*VerifyRefsLongFn)(size_t size, cl_long *refValues,
                             cl_long finalValue);

    // Float versions
    cl_float (*ExpectedValueFloatFn)(size_t size, cl_float *startRefValues,
                                     size_t whichDestValue);
    void (*GenerateRefsFloatFn)(size_t size, cl_float *startRefValues,
                                MTdata d);
    bool (*VerifyRefsFloatFn)(size_t size, cl_float *refValues,
                              cl_float finalValue);
};
88
check_atomic_support(cl_device_id device,bool extended,bool isLocal,ExplicitType dataType)89 bool check_atomic_support(cl_device_id device, bool extended, bool isLocal,
90 ExplicitType dataType)
91 {
92 // clang-format off
93 const char *extensionNames[8] = {
94 "cl_khr_global_int32_base_atomics", "cl_khr_global_int32_extended_atomics",
95 "cl_khr_local_int32_base_atomics", "cl_khr_local_int32_extended_atomics",
96 "cl_khr_int64_base_atomics", "cl_khr_int64_extended_atomics",
97 "cl_khr_int64_base_atomics", "cl_khr_int64_extended_atomics" // this line intended to be the same as the last one
98 };
99 // clang-format on
100
101 size_t index = 0;
102 if (extended) index += 1;
103 if (isLocal) index += 2;
104
105 Version version = get_device_cl_version(device);
106
107 switch (dataType)
108 {
109 case kInt:
110 case kUInt:
111 if (version >= Version(1, 1)) return 1;
112 break;
113 case kLong:
114 case kULong: index += 4; break;
115 case kFloat: // this has to stay separate since the float atomics arent
116 // in the 1.0 extensions
117 return version >= Version(1, 1);
118 default:
119 log_error(
120 "ERROR: Unsupported data type (%d) in check_atomic_support\n",
121 dataType);
122 return 0;
123 }
124
125 return is_extension_available(device, extensionNames[index]);
126 }
127
// Core driver for one atomic-function test configuration.
//
// Assembles a kernel from four pieces (extension pragma, global/local
// pattern header, the caller's test core, pattern footer), seeds the
// destination buffer with the start value from testFns, runs the kernel
// over up to num_elements work-items, then validates either the expected
// final value (Expected*Fn) and/or the per-thread "old" values
// (Verify*Fn). Finally re-runs with a single work-item to confirm the
// atomic op returns the pre-operation value.
//
// Returns 0 on pass (including "extension not supported", which counts
// as an implicit pass), -1 on failure.
int test_atomic_function(cl_device_id deviceID, cl_context context,
                         cl_command_queue queue, int num_elements,
                         const char *programCore, TestFns testFns,
                         bool extended, bool isLocal, ExplicitType dataType,
                         bool matchGroupSize)
{
    clProgramWrapper program;
    clKernelWrapper kernel;
    int error;
    size_t threads[1];
    clMemWrapper streams[2];
    void *refValues, *startRefValues;
    size_t threadSize, groupSize;
    const char *programLines[4];
    char pragma[512];
    char programHeader[512];
    MTdata d;
    size_t typeSize = get_explicit_type_size(dataType);


    // Verify we can run first
    bool isUnsigned = (dataType == kULong) || (dataType == kUInt);
    if (!check_atomic_support(deviceID, extended, isLocal, dataType))
    {
        // Only print for the signed (unsigned comes right after, and if signed
        // isn't supported, unsigned isn't either)
        if (dataType == kFloat)
            log_info("\t%s float not supported\n",
                     isLocal ? "Local" : "Global");
        else if (!isUnsigned)
            log_info("\t%s %sint%d not supported\n",
                     isLocal ? "Local" : "Global", isUnsigned ? "u" : "",
                     (int)typeSize * 8);
        // Since we don't support the operation, they implicitly pass
        return 0;
    }
    else
    {
        if (dataType == kFloat)
            log_info("\t%s float%s...", isLocal ? "local" : "global",
                     isLocal ? " " : "");
        else
            log_info("\t%s %sint%d%s%s...", isLocal ? "local" : "global",
                     isUnsigned ? "u" : "", (int)typeSize * 8,
                     isUnsigned ? "" : " ", isLocal ? " " : "");
    }

    //// Set up the kernel code

    // Create the pragma line for this kernel
    bool isLong = (dataType == kLong || dataType == kULong);
    sprintf(pragma,
            "#pragma OPENCL EXTENSION cl_khr%s_int%s_%s_atomics : enable\n",
            isLong ? "" : (isLocal ? "_local" : "_global"),
            isLong ? "64" : "32", extended ? "extended" : "base");

    // Now create the program header
    const char *typeName = get_explicit_type_name(dataType);
    if (isLocal)
        sprintf(programHeader, atomic_local_pattern[0], typeName, typeName,
                typeName);
    else
        sprintf(programHeader, atomic_global_pattern[0], typeName, typeName);

    // Set up our entire program now
    programLines[0] = pragma;
    programLines[1] = programHeader;
    programLines[2] = programCore;
    programLines[3] =
        (isLocal) ? atomic_local_pattern[1] : atomic_global_pattern[1];

    if (create_single_kernel_helper(context, &program, &kernel, 4, programLines,
                                    "test_atomic_fn"))
    {
        return -1;
    }

    //// Set up to actually run
    threadSize = num_elements;

    error =
        get_max_common_work_group_size(context, kernel, threadSize, &groupSize);
    test_error(error, "Unable to get thread group max size");

    if (matchGroupSize)
        // HACK because xchg and cmpxchg apparently are limited by hardware
        threadSize = groupSize;

    if (isLocal)
    {
        // For local tests, every work-item must fit in a single work-group,
        // so clamp the launch to the device/kernel work-group limits.
        size_t maxSizes[3] = { 0, 0, 0 };
        error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
                                3 * sizeof(size_t), maxSizes, 0);
        test_error(error,
                   "Unable to obtain max work item sizes for the device");

        size_t workSize;
        error = clGetKernelWorkGroupInfo(kernel, deviceID,
                                         CL_KERNEL_WORK_GROUP_SIZE,
                                         sizeof(workSize), &workSize, NULL);
        test_error(
            error,
            "Unable to obtain max work group size for device and kernel combo");

        // Limit workSize to avoid extremely large local buffer size and slow
        // run.
        if (workSize > 65536) workSize = 65536;

        // "workSize" is limited to that of the first dimension as only a
        // 1DRange is executed.
        if (maxSizes[0] < workSize)
        {
            workSize = maxSizes[0];
        }

        threadSize = groupSize = workSize;
    }


    log_info("\t(thread count %d, group size %d)\n", (int)threadSize,
             (int)groupSize);

    // NOTE(review): refValues / startRefValues / destItems are leaked on the
    // early error returns below (including the ones hidden in test_error) —
    // tolerable in a short-lived test process, but worth confirming.
    refValues = (cl_int *)malloc(typeSize * threadSize);

    if (testFns.GenerateRefsIntFn != NULL)
    {
        // We have a ref generator provided
        d = init_genrand(gRandomSeed);
        startRefValues = malloc(typeSize * threadSize);
        if (typeSize == 4)
            testFns.GenerateRefsIntFn(threadSize, (cl_int *)startRefValues, d);
        else
            testFns.GenerateRefsLongFn(threadSize, (cl_long *)startRefValues,
                                       d);
        free_mtdata(d);
        d = NULL;
    }
    else
        startRefValues = NULL;

    // If we're given a num_results function, we need to determine how many
    // result objects we need. If we don't have it, we assume it's just 1
    size_t numDestItems = (testFns.NumResultsFn != NULL)
        ? testFns.NumResultsFn(threadSize, dataType)
        : 1;

    char *destItems = new char[typeSize * numDestItems];
    if (destItems == NULL)
    {
        log_error("ERROR: Unable to allocate memory!\n");
        return -1;
    }
    // Seed every destination item with the type-appropriate start value.
    void *startValue = (typeSize == 4) ? (void *)&testFns.mIntStartValue
                                       : (void *)&testFns.mLongStartValue;
    for (size_t i = 0; i < numDestItems; i++)
        memcpy(destItems + i * typeSize, startValue, typeSize);

    streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                                typeSize * numDestItems, destItems, NULL);
    if (!streams[0])
    {
        log_error("ERROR: Creating output array failed!\n");
        return -1;
    }
    streams[1] = clCreateBuffer(
        context,
        ((startRefValues != NULL ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE)),
        typeSize * threadSize, startRefValues, NULL);
    if (!streams[1])
    {
        log_error("ERROR: Creating reference array failed!\n");
        return -1;
    }

    /* Set the arguments */
    error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
    test_error(error, "Unable to set indexed kernel arguments");
    error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
    test_error(error, "Unable to set indexed kernel arguments");

    if (isLocal)
    {
        // Local variant takes the __local scratch buffer and its item count
        // as extra arguments (see atomic_local_pattern).
        error = clSetKernelArg(kernel, 2, typeSize * numDestItems, NULL);
        test_error(error, "Unable to set indexed local kernel argument");

        cl_int numDestItemsInt = (cl_int)numDestItems;
        error = clSetKernelArg(kernel, 3, sizeof(cl_int), &numDestItemsInt);
        test_error(error, "Unable to set indexed kernel argument");
    }

    /* Run the kernel */
    threads[0] = threadSize;
    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, &groupSize,
                                   0, NULL, NULL);
    test_error(error, "Unable to execute test kernel");

    // Blocking reads, so no explicit clFinish is needed before validation.
    error =
        clEnqueueReadBuffer(queue, streams[0], true, 0, typeSize * numDestItems,
                            destItems, 0, NULL, NULL);
    test_error(error, "Unable to read result value!");

    error =
        clEnqueueReadBuffer(queue, streams[1], true, 0, typeSize * threadSize,
                            refValues, 0, NULL, NULL);
    test_error(error, "Unable to read reference values!");

    // If we have an expectedFn, then we need to generate a final value to
    // compare against. If we don't have one, it's because we're comparing ref
    // values only
    if (testFns.ExpectedValueIntFn != NULL)
    {
        for (size_t i = 0; i < numDestItems; i++)
        {
            char expected[8];
            cl_int intVal;
            cl_long longVal;
            if (typeSize == 4)
            {
                // Int version
                intVal = testFns.ExpectedValueIntFn(
                    threadSize, (cl_int *)startRefValues, i);
                memcpy(expected, &intVal, sizeof(intVal));
            }
            else
            {
                // Long version
                longVal = testFns.ExpectedValueLongFn(
                    threadSize, (cl_long *)startRefValues, i);
                memcpy(expected, &longVal, sizeof(longVal));
            }

            if (memcmp(expected, destItems + i * typeSize, typeSize) != 0)
            {
                if (typeSize == 4)
                {
                    cl_int *outValue = (cl_int *)(destItems + i * typeSize);
                    log_error("ERROR: Result %zu from kernel does not "
                              "validate! (should be %d, was %d)\n",
                              i, intVal, *outValue);
                    cl_int *startRefs = (cl_int *)startRefValues;
                    cl_int *refs = (cl_int *)refValues;
                    // NOTE(review): this dump loop reuses the outer index
                    // "i"; harmless only because we return right after.
                    for (i = 0; i < threadSize; i++)
                    {
                        if (startRefs != NULL)
                            log_info(" --- %zu - %d --- %d\n", i, startRefs[i],
                                     refs[i]);
                        else
                            log_info(" --- %zu --- %d\n", i, refs[i]);
                    }
                }
                else
                {
                    cl_long *outValue = (cl_long *)(destItems + i * typeSize);
                    log_error("ERROR: Result %zu from kernel does not "
                              "validate! (should be %" PRId64 ", was %" PRId64
                              ")\n",
                              i, longVal, *outValue);
                    cl_long *startRefs = (cl_long *)startRefValues;
                    cl_long *refs = (cl_long *)refValues;
                    for (i = 0; i < threadSize; i++)
                    {
                        if (startRefs != NULL)
                            log_info(" --- %zu - %" PRId64 " --- %" PRId64 "\n",
                                     i, startRefs[i], refs[i]);
                        else
                            log_info(" --- %zu --- %" PRId64 "\n", i, refs[i]);
                    }
                }
                return -1;
            }
        }
    }

    if (testFns.VerifyRefsIntFn != NULL)
    {
        /* Use the verify function to also check the results */
        // Note: "!fn(...) != 0" parses as "(!fn(...)) != 0", i.e. the test
        // fails when the verify callback returns false.
        if (dataType == kFloat)
        {
            cl_float *outValue = (cl_float *)destItems;
            if (!testFns.VerifyRefsFloatFn(threadSize, (cl_float *)refValues,
                                           *outValue)
                != 0)
            {
                log_error("ERROR: Reference values did not validate!\n");
                return -1;
            }
        }
        else if (typeSize == 4)
        {
            cl_int *outValue = (cl_int *)destItems;
            if (!testFns.VerifyRefsIntFn(threadSize, (cl_int *)refValues,
                                         *outValue)
                != 0)
            {
                log_error("ERROR: Reference values did not validate!\n");
                return -1;
            }
        }
        else
        {
            cl_long *outValue = (cl_long *)destItems;
            if (!testFns.VerifyRefsLongFn(threadSize, (cl_long *)refValues,
                                          *outValue)
                != 0)
            {
                log_error("ERROR: Reference values did not validate!\n");
                return -1;
            }
        }
    }
    else if (testFns.ExpectedValueIntFn == NULL)
    {
        log_error("ERROR: Test doesn't check total or refs; no values are "
                  "verified!\n");
        return -1;
    }


    /* Re-write the starting value */
    for (size_t i = 0; i < numDestItems; i++)
        memcpy(destItems + i * typeSize, startValue, typeSize);
    error =
        clEnqueueWriteBuffer(queue, streams[0], true, 0,
                             typeSize * numDestItems, destItems, 0, NULL, NULL);
    test_error(error, "Unable to write starting values!");

    /* Run the kernel once for a single thread, so we can verify that the
     * returned value is the original one */
    threads[0] = 1;
    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, threads, 0,
                                   NULL, NULL);
    test_error(error, "Unable to execute test kernel");

    error = clEnqueueReadBuffer(queue, streams[1], true, 0, typeSize, refValues,
                                0, NULL, NULL);
    test_error(error, "Unable to read reference values!");

    // The single thread's "old value" must equal the seed we just wrote.
    if (memcmp(refValues, destItems, typeSize) != 0)
    {
        if (typeSize == 4)
        {
            cl_int *s = (cl_int *)destItems;
            cl_int *r = (cl_int *)refValues;
            log_error("ERROR: atomic function operated correctly but did NOT "
                      "return correct 'old' value "
                      " (should have been %d, returned %d)!\n",
                      *s, *r);
        }
        else
        {
            cl_long *s = (cl_long *)destItems;
            cl_long *r = (cl_long *)refValues;
            log_error("ERROR: atomic function operated correctly but did NOT "
                      "return correct 'old' value "
                      " (should have been %" PRId64 ", returned %" PRId64
                      ")!\n",
                      *s, *r);
        }
        return -1;
    }

    delete[] destItems;
    free(refValues);
    if (startRefValues != NULL) free(startRefValues);

    return 0;
}
495
test_atomic_function_set(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,const char * programCore,TestFns testFns,bool extended,bool matchGroupSize,bool usingAtomicPrefix)496 int test_atomic_function_set(cl_device_id deviceID, cl_context context,
497 cl_command_queue queue, int num_elements,
498 const char *programCore, TestFns testFns,
499 bool extended, bool matchGroupSize,
500 bool usingAtomicPrefix)
501 {
502 log_info(" Testing %s functions...\n",
503 usingAtomicPrefix ? "atomic_" : "atom_");
504
505 int errors = 0;
506 errors |= test_atomic_function(deviceID, context, queue, num_elements,
507 programCore, testFns, extended, false, kInt,
508 matchGroupSize);
509 errors |= test_atomic_function(deviceID, context, queue, num_elements,
510 programCore, testFns, extended, false, kUInt,
511 matchGroupSize);
512 errors |= test_atomic_function(deviceID, context, queue, num_elements,
513 programCore, testFns, extended, true, kInt,
514 matchGroupSize);
515 errors |= test_atomic_function(deviceID, context, queue, num_elements,
516 programCore, testFns, extended, true, kUInt,
517 matchGroupSize);
518
519 // Only the 32 bit atomic functions use the "atomic" prefix in 1.1, the 64
520 // bit functions still use the "atom" prefix. The argument usingAtomicPrefix
521 // is set to true if programCore was generated with the "atomic" prefix.
522 if (!usingAtomicPrefix)
523 {
524 errors |= test_atomic_function(deviceID, context, queue, num_elements,
525 programCore, testFns, extended, false,
526 kLong, matchGroupSize);
527 errors |= test_atomic_function(deviceID, context, queue, num_elements,
528 programCore, testFns, extended, false,
529 kULong, matchGroupSize);
530 errors |= test_atomic_function(deviceID, context, queue, num_elements,
531 programCore, testFns, extended, true,
532 kLong, matchGroupSize);
533 errors |= test_atomic_function(deviceID, context, queue, num_elements,
534 programCore, testFns, extended, true,
535 kULong, matchGroupSize);
536 }
537
538 return errors;
539 }
540
541 #pragma mark ---- add
542
// Test cores for atomic add: each thread adds (tid + 3) four times and
// records the "old" value returned by the first call. The two variants
// differ only in the 1.0 ("atom_") vs 1.1 ("atomic_") prefix.
const char atom_add_core[] =
    " oldValues[tid] = atom_add( &destMemory[0], tid + 3 );\n"
    " atom_add( &destMemory[0], tid + 3 );\n"
    " atom_add( &destMemory[0], tid + 3 );\n"
    " atom_add( &destMemory[0], tid + 3 );\n";

const char atomic_add_core[] =
    " oldValues[tid] = atomic_add( &destMemory[0], tid + 3 );\n"
    " atomic_add( &destMemory[0], tid + 3 );\n"
    " atomic_add( &destMemory[0], tid + 3 );\n"
    " atomic_add( &destMemory[0], tid + 3 );\n";
554
test_atomic_add_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)555 cl_int test_atomic_add_result_int(size_t size, cl_int *startRefValues,
556 size_t whichDestValue)
557 {
558 cl_int total = 0;
559 for (size_t i = 0; i < size; i++) total += ((cl_int)i + 3) * 4;
560 return total;
561 }
562
test_atomic_add_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)563 cl_long test_atomic_add_result_long(size_t size, cl_long *startRefValues,
564 size_t whichDestValue)
565 {
566 cl_long total = 0;
567 for (size_t i = 0; i < size; i++) total += ((i + 3) * 4);
568 return total;
569 }
570
test_atomic_add(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)571 int test_atomic_add(cl_device_id deviceID, cl_context context,
572 cl_command_queue queue, int num_elements)
573 {
574 TestFns set = { 0,
575 0LL,
576 NULL,
577 test_atomic_add_result_int,
578 NULL,
579 NULL,
580 test_atomic_add_result_long,
581 NULL,
582 NULL };
583
584 if (test_atomic_function_set(
585 deviceID, context, queue, num_elements, atom_add_core, set, false,
586 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
587 != 0)
588 return -1;
589 if (test_atomic_function_set(
590 deviceID, context, queue, num_elements, atomic_add_core, set, false,
591 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
592 != 0)
593 return -1;
594 return 0;
595 }
596
597 #pragma mark ---- sub
598
// Test cores for atomic sub: each thread subtracts (tid + 3) once and
// records the returned "old" value.
const char atom_sub_core[] =
    " oldValues[tid] = atom_sub( &destMemory[0], tid + 3 );\n";

const char atomic_sub_core[] =
    " oldValues[tid] = atomic_sub( &destMemory[0], tid + 3 );\n";
604
test_atomic_sub_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)605 cl_int test_atomic_sub_result_int(size_t size, cl_int *startRefValues,
606 size_t whichDestValue)
607 {
608 cl_int total = INT_TEST_VALUE;
609 for (size_t i = 0; i < size; i++) total -= (cl_int)i + 3;
610 return total;
611 }
612
test_atomic_sub_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)613 cl_long test_atomic_sub_result_long(size_t size, cl_long *startRefValues,
614 size_t whichDestValue)
615 {
616 cl_long total = LONG_TEST_VALUE;
617 for (size_t i = 0; i < size; i++) total -= i + 3;
618 return total;
619 }
620
test_atomic_sub(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)621 int test_atomic_sub(cl_device_id deviceID, cl_context context,
622 cl_command_queue queue, int num_elements)
623 {
624 TestFns set = { INT_TEST_VALUE,
625 LONG_TEST_VALUE,
626 NULL,
627 test_atomic_sub_result_int,
628 NULL,
629 NULL,
630 test_atomic_sub_result_long,
631 NULL,
632 NULL };
633
634 if (test_atomic_function_set(
635 deviceID, context, queue, num_elements, atom_sub_core, set, false,
636 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
637 != 0)
638 return -1;
639 if (test_atomic_function_set(
640 deviceID, context, queue, num_elements, atomic_sub_core, set, false,
641 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
642 != 0)
643 return -1;
644 return 0;
645 }
646
647 #pragma mark ---- xchg
648
// Test cores for atomic xchg: each thread exchanges its tid into the
// destination and records the value it displaced. The float variant is
// textually identical to the int one — the destination type comes from
// the %s substitution in the kernel pattern.
const char atom_xchg_core[] =
    " oldValues[tid] = atom_xchg( &destMemory[0], tid );\n";

const char atomic_xchg_core[] =
    " oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
const char atomic_xchg_float_core[] =
    " oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
656
test_atomic_xchg_verify_int(size_t size,cl_int * refValues,cl_int finalValue)657 bool test_atomic_xchg_verify_int(size_t size, cl_int *refValues,
658 cl_int finalValue)
659 {
660 /* For xchg, each value from 0 to size - 1 should have an entry in the ref
661 * array, and ONLY one entry */
662 char *valids;
663 size_t i;
664 char originalValidCount = 0;
665
666 valids = (char *)malloc(sizeof(char) * size);
667 memset(valids, 0, sizeof(char) * size);
668
669 for (i = 0; i < size; i++)
670 {
671 if (refValues[i] == INT_TEST_VALUE)
672 {
673 // Special initial value
674 originalValidCount++;
675 continue;
676 }
677 if (refValues[i] < 0 || (size_t)refValues[i] >= size)
678 {
679 log_error(
680 "ERROR: Reference value %zu outside of valid range! (%d)\n", i,
681 refValues[i]);
682 return false;
683 }
684 valids[refValues[i]]++;
685 }
686
687 /* Note: ONE entry will have zero count. It'll be the last one that
688 executed, because that value should be the final value outputted */
689 if (valids[finalValue] > 0)
690 {
691 log_error("ERROR: Final value %d was also in ref list!\n", finalValue);
692 return false;
693 }
694 else
695 valids[finalValue] = 1; // So the following loop will be okay
696
697 /* Now check that every entry has one and only one count */
698 if (originalValidCount != 1)
699 {
700 log_error("ERROR: Starting reference value %d did not occur "
701 "once-and-only-once (occurred %d)\n",
702 65191, originalValidCount);
703 return false;
704 }
705 for (i = 0; i < size; i++)
706 {
707 if (valids[i] != 1)
708 {
709 log_error("ERROR: Reference value %zu did not occur "
710 "once-and-only-once (occurred %d)\n",
711 i, valids[i]);
712 for (size_t j = 0; j < size; j++)
713 log_info("%d: %d\n", (int)j, (int)valids[j]);
714 return false;
715 }
716 }
717
718 free(valids);
719 return true;
720 }
721
test_atomic_xchg_verify_long(size_t size,cl_long * refValues,cl_long finalValue)722 bool test_atomic_xchg_verify_long(size_t size, cl_long *refValues,
723 cl_long finalValue)
724 {
725 /* For xchg, each value from 0 to size - 1 should have an entry in the ref
726 * array, and ONLY one entry */
727 char *valids;
728 size_t i;
729 char originalValidCount = 0;
730
731 valids = (char *)malloc(sizeof(char) * size);
732 memset(valids, 0, sizeof(char) * size);
733
734 for (i = 0; i < size; i++)
735 {
736 if (refValues[i] == LONG_TEST_VALUE)
737 {
738 // Special initial value
739 originalValidCount++;
740 continue;
741 }
742 if (refValues[i] < 0 || (size_t)refValues[i] >= size)
743 {
744 log_error(
745 "ERROR: Reference value %zu outside of valid range! (%" PRId64
746 ")\n",
747 i, refValues[i]);
748 return false;
749 }
750 valids[refValues[i]]++;
751 }
752
753 /* Note: ONE entry will have zero count. It'll be the last one that
754 executed, because that value should be the final value outputted */
755 if (valids[finalValue] > 0)
756 {
757 log_error("ERROR: Final value %" PRId64 " was also in ref list!\n",
758 finalValue);
759 return false;
760 }
761 else
762 valids[finalValue] = 1; // So the following loop will be okay
763
764 /* Now check that every entry has one and only one count */
765 if (originalValidCount != 1)
766 {
767 log_error("ERROR: Starting reference value %d did not occur "
768 "once-and-only-once (occurred %d)\n",
769 65191, originalValidCount);
770 return false;
771 }
772 for (i = 0; i < size; i++)
773 {
774 if (valids[i] != 1)
775 {
776 log_error("ERROR: Reference value %zu did not occur "
777 "once-and-only-once (occurred %d)\n",
778 i, valids[i]);
779 for (size_t j = 0; j < size; j++)
780 log_info("%d: %d\n", (int)j, (int)valids[j]);
781 return false;
782 }
783 }
784
785 free(valids);
786 return true;
787 }
788
test_atomic_xchg_verify_float(size_t size,cl_float * refValues,cl_float finalValue)789 bool test_atomic_xchg_verify_float(size_t size, cl_float *refValues,
790 cl_float finalValue)
791 {
792 /* For xchg, each value from 0 to size - 1 should have an entry in the ref
793 * array, and ONLY one entry */
794 char *valids;
795 size_t i;
796 char originalValidCount = 0;
797
798 valids = (char *)malloc(sizeof(char) * size);
799 memset(valids, 0, sizeof(char) * size);
800
801 for (i = 0; i < size; i++)
802 {
803 cl_int *intRefValue = (cl_int *)(&refValues[i]);
804 if (*intRefValue == INT_TEST_VALUE)
805 {
806 // Special initial value
807 originalValidCount++;
808 continue;
809 }
810 if (refValues[i] < 0 || (size_t)refValues[i] >= size)
811 {
812 log_error(
813 "ERROR: Reference value %zu outside of valid range! (%a)\n", i,
814 refValues[i]);
815 return false;
816 }
817 valids[(int)refValues[i]]++;
818 }
819
820 /* Note: ONE entry will have zero count. It'll be the last one that
821 executed, because that value should be the final value outputted */
822 if (valids[(int)finalValue] > 0)
823 {
824 log_error("ERROR: Final value %a was also in ref list!\n", finalValue);
825 return false;
826 }
827 else
828 valids[(int)finalValue] = 1; // So the following loop will be okay
829
830 /* Now check that every entry has one and only one count */
831 if (originalValidCount != 1)
832 {
833 log_error("ERROR: Starting reference value %d did not occur "
834 "once-and-only-once (occurred %d)\n",
835 65191, originalValidCount);
836 return false;
837 }
838 for (i = 0; i < size; i++)
839 {
840 if (valids[i] != 1)
841 {
842 log_error("ERROR: Reference value %zu did not occur "
843 "once-and-only-once (occurred %d)\n",
844 i, valids[i]);
845 for (size_t j = 0; j < size; j++)
846 log_info("%d: %d\n", (int)j, (int)valids[j]);
847 return false;
848 }
849 }
850
851 free(valids);
852 return true;
853 }
854
// Entry point for the atomic xchg tests. Uses the Verify*Fn callbacks
// (per-thread "old value" checking) rather than a single expected final
// value, since xchg results depend on execution order. matchGroupSize is
// true here: the test requires all exchanges to land in one work-group.
int test_atomic_xchg(cl_device_id deviceID, cl_context context,
                     cl_command_queue queue, int num_elements)
{
    // Positional TestFns initializer — order must match the struct:
    // int seed, long seed, NumResultsFn, {int,gen,verify}, {long,...},
    // {float,...}. Only the verify callbacks are provided.
    TestFns set = { INT_TEST_VALUE,
                    LONG_TEST_VALUE,
                    NULL,
                    NULL,
                    NULL,
                    test_atomic_xchg_verify_int,
                    NULL,
                    NULL,
                    test_atomic_xchg_verify_long,
                    NULL,
                    NULL,
                    test_atomic_xchg_verify_float };

    int errors = test_atomic_function_set(
        deviceID, context, queue, num_elements, atom_xchg_core, set, false,
        true, /*usingAtomicPrefix*/ false);
    errors |= test_atomic_function_set(deviceID, context, queue, num_elements,
                                       atomic_xchg_core, set, false, true,
                                       /*usingAtomicPrefix*/ true);

    // Float xchg is not part of the function-set matrix; run the global
    // and local variants explicitly.
    errors |= test_atomic_function(deviceID, context, queue, num_elements,
                                   atomic_xchg_float_core, set, false, false,
                                   kFloat, true);
    errors |= test_atomic_function(deviceID, context, queue, num_elements,
                                   atomic_xchg_float_core, set, false, true,
                                   kFloat, true);

    return errors;
}
887
888
889 #pragma mark ---- min
890
// Test cores for atomic min: each thread min-combines its generated seed
// value into the destination and records the returned "old" value.
const char atom_min_core[] =
    " oldValues[tid] = atom_min( &destMemory[0], oldValues[tid] );\n";

const char atomic_min_core[] =
    " oldValues[tid] = atomic_min( &destMemory[0], oldValues[tid] );\n";
896
test_atomic_min_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)897 cl_int test_atomic_min_result_int(size_t size, cl_int *startRefValues,
898 size_t whichDestValue)
899 {
900 cl_int total = 0x7fffffffL;
901 for (size_t i = 0; i < size; i++)
902 {
903 if (startRefValues[i] < total) total = startRefValues[i];
904 }
905 return total;
906 }
907
test_atomic_min_gen_int(size_t size,cl_int * startRefValues,MTdata d)908 void test_atomic_min_gen_int(size_t size, cl_int *startRefValues, MTdata d)
909 {
910 for (size_t i = 0; i < size; i++)
911 startRefValues[i] =
912 (cl_int)(genrand_int32(d) % 0x3fffffff) + 0x3fffffff;
913 }
914
test_atomic_min_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)915 cl_long test_atomic_min_result_long(size_t size, cl_long *startRefValues,
916 size_t whichDestValue)
917 {
918 cl_long total = 0x7fffffffffffffffLL;
919 for (size_t i = 0; i < size; i++)
920 {
921 if (startRefValues[i] < total) total = startRefValues[i];
922 }
923 return total;
924 }
925
test_atomic_min_gen_long(size_t size,cl_long * startRefValues,MTdata d)926 void test_atomic_min_gen_long(size_t size, cl_long *startRefValues, MTdata d)
927 {
928 for (size_t i = 0; i < size; i++)
929 startRefValues[i] =
930 (cl_long)(genrand_int32(d)
931 | (((cl_long)genrand_int32(d) & 0x7fffffffL) << 16));
932 }
933
test_atomic_min(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)934 int test_atomic_min(cl_device_id deviceID, cl_context context,
935 cl_command_queue queue, int num_elements)
936 {
937 TestFns set = { 0x7fffffffL,
938 0x7fffffffffffffffLL,
939 NULL,
940 test_atomic_min_result_int,
941 test_atomic_min_gen_int,
942 NULL,
943 test_atomic_min_result_long,
944 test_atomic_min_gen_long,
945 NULL };
946
947 if (test_atomic_function_set(
948 deviceID, context, queue, num_elements, atom_min_core, set, true,
949 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
950 != 0)
951 return -1;
952 if (test_atomic_function_set(
953 deviceID, context, queue, num_elements, atomic_min_core, set, true,
954 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
955 != 0)
956 return -1;
957 return 0;
958 }
959
960
961 #pragma mark ---- max
962
// Test cores for atomic max: each thread max-combines its generated seed
// value into the destination and records the returned "old" value.
const char atom_max_core[] =
    " oldValues[tid] = atom_max( &destMemory[0], oldValues[tid] );\n";

const char atomic_max_core[] =
    " oldValues[tid] = atomic_max( &destMemory[0], oldValues[tid] );\n";
968
test_atomic_max_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)969 cl_int test_atomic_max_result_int(size_t size, cl_int *startRefValues,
970 size_t whichDestValue)
971 {
972 cl_int total = 0;
973 for (size_t i = 0; i < size; i++)
974 {
975 if (startRefValues[i] > total) total = startRefValues[i];
976 }
977 return total;
978 }
979
test_atomic_max_gen_int(size_t size,cl_int * startRefValues,MTdata d)980 void test_atomic_max_gen_int(size_t size, cl_int *startRefValues, MTdata d)
981 {
982 for (size_t i = 0; i < size; i++)
983 startRefValues[i] =
984 (cl_int)(genrand_int32(d) % 0x3fffffff) + 0x3fffffff;
985 }
986
test_atomic_max_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)987 cl_long test_atomic_max_result_long(size_t size, cl_long *startRefValues,
988 size_t whichDestValue)
989 {
990 cl_long total = 0;
991 for (size_t i = 0; i < size; i++)
992 {
993 if (startRefValues[i] > total) total = startRefValues[i];
994 }
995 return total;
996 }
997
// Generate positive 64-bit operands for the max test: a full 32-bit draw
// OR'ed with a 31-bit draw shifted left by 16 (the sign bit stays clear).
// NOTE(review): the two genrand_int32() calls share one expression, so
// their evaluation order is unspecified; harmless for random test data,
// but it is why this expression is left exactly as written.
void test_atomic_max_gen_long(size_t size, cl_long *startRefValues, MTdata d)
{
    for (size_t i = 0; i < size; i++)
        startRefValues[i] =
            (cl_long)(genrand_int32(d)
                      | (((cl_long)genrand_int32(d) & 0x7fffffffL) << 16));
}
1005
test_atomic_max(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1006 int test_atomic_max(cl_device_id deviceID, cl_context context,
1007 cl_command_queue queue, int num_elements)
1008 {
1009 TestFns set = { 0,
1010 0,
1011 NULL,
1012 test_atomic_max_result_int,
1013 test_atomic_max_gen_int,
1014 NULL,
1015 test_atomic_max_result_long,
1016 test_atomic_max_gen_long,
1017 NULL };
1018
1019 if (test_atomic_function_set(
1020 deviceID, context, queue, num_elements, atom_max_core, set, true,
1021 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
1022 != 0)
1023 return -1;
1024 if (test_atomic_function_set(
1025 deviceID, context, queue, num_elements, atomic_max_core, set, true,
1026 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
1027 != 0)
1028 return -1;
1029 return 0;
1030 }
1031
1032
1033 #pragma mark ---- inc
1034
// Kernel cores for the increment test: each thread bumps destMemory[0] by 1
// and records the value it observed before its own increment.
const char atom_inc_core[] =
    "  oldValues[tid] = atom_inc( &destMemory[0] );\n";

// Same operation using the atomic_ prefix.
const char atomic_inc_core[] =
    "  oldValues[tid] = atomic_inc( &destMemory[0] );\n";
1040
test_atomic_inc_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)1041 cl_int test_atomic_inc_result_int(size_t size, cl_int *startRefValues,
1042 size_t whichDestValue)
1043 {
1044 return INT_TEST_VALUE + (cl_int)size;
1045 }
1046
test_atomic_inc_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)1047 cl_long test_atomic_inc_result_long(size_t size, cl_long *startRefValues,
1048 size_t whichDestValue)
1049 {
1050 return LONG_TEST_VALUE + size;
1051 }
1052
test_atomic_inc(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1053 int test_atomic_inc(cl_device_id deviceID, cl_context context,
1054 cl_command_queue queue, int num_elements)
1055 {
1056 TestFns set = { INT_TEST_VALUE,
1057 LONG_TEST_VALUE,
1058 NULL,
1059 test_atomic_inc_result_int,
1060 NULL,
1061 NULL,
1062 test_atomic_inc_result_long,
1063 NULL,
1064 NULL };
1065
1066 if (test_atomic_function_set(
1067 deviceID, context, queue, num_elements, atom_inc_core, set, false,
1068 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
1069 != 0)
1070 return -1;
1071 if (test_atomic_function_set(
1072 deviceID, context, queue, num_elements, atomic_inc_core, set, false,
1073 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
1074 != 0)
1075 return -1;
1076 return 0;
1077 }
1078
1079
1080 #pragma mark ---- dec
1081
// Kernel cores for the decrement test: each thread decrements destMemory[0]
// by 1 and records the value it observed before its own decrement.
const char atom_dec_core[] =
    "  oldValues[tid] = atom_dec( &destMemory[0] );\n";

// Same operation using the atomic_ prefix.
const char atomic_dec_core[] =
    "  oldValues[tid] = atomic_dec( &destMemory[0] );\n";
1087
test_atomic_dec_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)1088 cl_int test_atomic_dec_result_int(size_t size, cl_int *startRefValues,
1089 size_t whichDestValue)
1090 {
1091 return INT_TEST_VALUE - (cl_int)size;
1092 }
1093
test_atomic_dec_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)1094 cl_long test_atomic_dec_result_long(size_t size, cl_long *startRefValues,
1095 size_t whichDestValue)
1096 {
1097 return LONG_TEST_VALUE - size;
1098 }
1099
test_atomic_dec(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1100 int test_atomic_dec(cl_device_id deviceID, cl_context context,
1101 cl_command_queue queue, int num_elements)
1102 {
1103 TestFns set = { INT_TEST_VALUE,
1104 LONG_TEST_VALUE,
1105 NULL,
1106 test_atomic_dec_result_int,
1107 NULL,
1108 NULL,
1109 test_atomic_dec_result_long,
1110 NULL,
1111 NULL };
1112
1113 if (test_atomic_function_set(
1114 deviceID, context, queue, num_elements, atom_dec_core, set, false,
1115 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
1116 != 0)
1117 return -1;
1118 if (test_atomic_function_set(
1119 deviceID, context, queue, num_elements, atomic_dec_core, set, false,
1120 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
1121 != 0)
1122 return -1;
1123 return 0;
1124 }
1125
1126
1127 #pragma mark ---- cmpxchg
1128
/* We test cmpxchg by implementing (the long way) atom_add */
// clang-format off
// 32-bit compare-exchange loop: keep retrying until the CAS observes the
// value we read, which makes each thread's "+ tid + 2" land exactly once.
const char atom_cmpxchg_core[] =
    "  int oldValue, origValue, newValue;\n"
    "  do { \n"
    "    origValue = destMemory[0];\n"
    "    newValue = origValue + tid + 2;\n"
    "    oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
    "  } while( oldValue != origValue );\n"
    "  oldValues[tid] = oldValue;\n";

// 64-bit variant of the same CAS loop (long operands).
const char atom_cmpxchg64_core[] =
    "  long oldValue, origValue, newValue;\n"
    "  do { \n"
    "    origValue = destMemory[0];\n"
    "    newValue = origValue + tid + 2;\n"
    "    oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
    "  } while( oldValue != origValue );\n"
    "  oldValues[tid] = oldValue;\n";

// 32-bit variant using the atomic_ prefix.
const char atomic_cmpxchg_core[] =
    "  int oldValue, origValue, newValue;\n"
    "  do { \n"
    "    origValue = destMemory[0];\n"
    "    newValue = origValue + tid + 2;\n"
    "    oldValue = atomic_cmpxchg( &destMemory[0], origValue, newValue );\n"
    "  } while( oldValue != origValue );\n"
    "  oldValues[tid] = oldValue;\n";
// clang-format on
1158
test_atomic_cmpxchg_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)1159 cl_int test_atomic_cmpxchg_result_int(size_t size, cl_int *startRefValues,
1160 size_t whichDestValue)
1161 {
1162 cl_int total = INT_TEST_VALUE;
1163 for (size_t i = 0; i < size; i++) total += (cl_int)i + 2;
1164 return total;
1165 }
1166
test_atomic_cmpxchg_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)1167 cl_long test_atomic_cmpxchg_result_long(size_t size, cl_long *startRefValues,
1168 size_t whichDestValue)
1169 {
1170 cl_long total = LONG_TEST_VALUE;
1171 for (size_t i = 0; i < size; i++) total += i + 2;
1172 return total;
1173 }
1174
test_atomic_cmpxchg(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1175 int test_atomic_cmpxchg(cl_device_id deviceID, cl_context context,
1176 cl_command_queue queue, int num_elements)
1177 {
1178 TestFns set = { INT_TEST_VALUE,
1179 LONG_TEST_VALUE,
1180 NULL,
1181 test_atomic_cmpxchg_result_int,
1182 NULL,
1183 NULL,
1184 test_atomic_cmpxchg_result_long,
1185 NULL,
1186 NULL };
1187
1188 int errors = 0;
1189
1190 log_info(" Testing atom_ functions...\n");
1191 errors |=
1192 test_atomic_function(deviceID, context, queue, num_elements,
1193 atom_cmpxchg_core, set, false, false, kInt, true);
1194 errors |=
1195 test_atomic_function(deviceID, context, queue, num_elements,
1196 atom_cmpxchg_core, set, false, false, kUInt, true);
1197 errors |=
1198 test_atomic_function(deviceID, context, queue, num_elements,
1199 atom_cmpxchg_core, set, false, true, kInt, true);
1200 errors |=
1201 test_atomic_function(deviceID, context, queue, num_elements,
1202 atom_cmpxchg_core, set, false, true, kUInt, true);
1203
1204 errors |= test_atomic_function(deviceID, context, queue, num_elements,
1205 atom_cmpxchg64_core, set, false, false,
1206 kLong, true);
1207 errors |= test_atomic_function(deviceID, context, queue, num_elements,
1208 atom_cmpxchg64_core, set, false, false,
1209 kULong, true);
1210 errors |= test_atomic_function(deviceID, context, queue, num_elements,
1211 atom_cmpxchg64_core, set, false, true, kLong,
1212 true);
1213 errors |= test_atomic_function(deviceID, context, queue, num_elements,
1214 atom_cmpxchg64_core, set, false, true,
1215 kULong, true);
1216
1217 log_info(" Testing atomic_ functions...\n");
1218 errors |= test_atomic_function(deviceID, context, queue, num_elements,
1219 atomic_cmpxchg_core, set, false, false, kInt,
1220 true);
1221 errors |= test_atomic_function(deviceID, context, queue, num_elements,
1222 atomic_cmpxchg_core, set, false, false,
1223 kUInt, true);
1224 errors |=
1225 test_atomic_function(deviceID, context, queue, num_elements,
1226 atomic_cmpxchg_core, set, false, true, kInt, true);
1227 errors |= test_atomic_function(deviceID, context, queue, num_elements,
1228 atomic_cmpxchg_core, set, false, true, kUInt,
1229 true);
1230
1231 if (errors) return -1;
1232
1233 return 0;
1234 }
1235
1236 #pragma mark -------- Bitwise functions
1237
test_bitwise_num_results(size_t threadCount,ExplicitType dataType)1238 size_t test_bitwise_num_results(size_t threadCount, ExplicitType dataType)
1239 {
1240 size_t numBits = get_explicit_type_size(dataType) * 8;
1241
1242 return (threadCount + numBits - 1) / numBits;
1243 }
1244
1245 #pragma mark ---- and
1246
// clang-format off
// Kernel cores for the AND test: each thread clears its own bit of the
// destination word selected by tid, so a fully-covered word ends at 0.
const char atom_and_core[] =
    "  size_t numBits = sizeof( destMemory[0] ) * 8;\n"
    "  int whichResult = tid / numBits;\n"
    "  int bitIndex = tid - ( whichResult * numBits );\n"
    "\n"
    "  oldValues[tid] = atom_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n";

// Same operation using the atomic_ prefix.
const char atomic_and_core[] =
    "  size_t numBits = sizeof( destMemory[0] ) * 8;\n"
    "  int whichResult = tid / numBits;\n"
    "  int bitIndex = tid - ( whichResult * numBits );\n"
    "\n"
    "  oldValues[tid] = atomic_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n";
// clang-format on
1262
1263
test_atomic_and_result_int(size_t size,cl_int * startRefValues,size_t whichResult)1264 cl_int test_atomic_and_result_int(size_t size, cl_int *startRefValues,
1265 size_t whichResult)
1266 {
1267 size_t numThreads = ((size_t)size + 31) / 32;
1268 if (whichResult < numThreads - 1) return 0;
1269
1270 // Last item doesn't get and'ed on every bit, so we have to mask away
1271 size_t numBits = (size_t)size - whichResult * 32;
1272 cl_int bits = (cl_int)0xffffffffL;
1273 for (size_t i = 0; i < numBits; i++) bits &= ~(1 << i);
1274
1275 return bits;
1276 }
1277
test_atomic_and_result_long(size_t size,cl_long * startRefValues,size_t whichResult)1278 cl_long test_atomic_and_result_long(size_t size, cl_long *startRefValues,
1279 size_t whichResult)
1280 {
1281 size_t numThreads = ((size_t)size + 63) / 64;
1282 if (whichResult < numThreads - 1) return 0;
1283
1284 // Last item doesn't get and'ed on every bit, so we have to mask away
1285 size_t numBits = (size_t)size - whichResult * 64;
1286 cl_long bits = (cl_long)0xffffffffffffffffLL;
1287 for (size_t i = 0; i < numBits; i++) bits &= ~(1LL << i);
1288
1289 return bits;
1290 }
1291
test_atomic_and(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1292 int test_atomic_and(cl_device_id deviceID, cl_context context,
1293 cl_command_queue queue, int num_elements)
1294 {
1295 TestFns set = { 0xffffffff,
1296 0xffffffffffffffffLL,
1297 test_bitwise_num_results,
1298 test_atomic_and_result_int,
1299 NULL,
1300 NULL,
1301 test_atomic_and_result_long,
1302 NULL,
1303 NULL };
1304
1305 if (test_atomic_function_set(
1306 deviceID, context, queue, num_elements, atom_and_core, set, true,
1307 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
1308 != 0)
1309 return -1;
1310 if (test_atomic_function_set(
1311 deviceID, context, queue, num_elements, atomic_and_core, set, true,
1312 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
1313 != 0)
1314 return -1;
1315 return 0;
1316 }
1317
1318
1319 #pragma mark ---- or
1320
// clang-format off
// Kernel cores for the OR test: each thread sets its own bit of the
// destination word selected by tid, so a fully-covered word ends all-ones.
const char atom_or_core[] =
    "  size_t numBits = sizeof( destMemory[0] ) * 8;\n"
    "  int whichResult = tid / numBits;\n"
    "  int bitIndex = tid - ( whichResult * numBits );\n"
    "\n"
    "  oldValues[tid] = atom_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n";

// Same operation using the atomic_ prefix.
const char atomic_or_core[] =
    "  size_t numBits = sizeof( destMemory[0] ) * 8;\n"
    "  int whichResult = tid / numBits;\n"
    "  int bitIndex = tid - ( whichResult * numBits );\n"
    "\n"
    "  oldValues[tid] = atomic_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n";
// clang-format on
1336
test_atomic_or_result_int(size_t size,cl_int * startRefValues,size_t whichResult)1337 cl_int test_atomic_or_result_int(size_t size, cl_int *startRefValues,
1338 size_t whichResult)
1339 {
1340 size_t numThreads = ((size_t)size + 31) / 32;
1341 if (whichResult < numThreads - 1) return 0xffffffff;
1342
1343 // Last item doesn't get and'ed on every bit, so we have to mask away
1344 size_t numBits = (size_t)size - whichResult * 32;
1345 cl_int bits = 0;
1346 for (size_t i = 0; i < numBits; i++) bits |= (1 << i);
1347
1348 return bits;
1349 }
1350
test_atomic_or_result_long(size_t size,cl_long * startRefValues,size_t whichResult)1351 cl_long test_atomic_or_result_long(size_t size, cl_long *startRefValues,
1352 size_t whichResult)
1353 {
1354 size_t numThreads = ((size_t)size + 63) / 64;
1355 if (whichResult < numThreads - 1) return 0x0ffffffffffffffffLL;
1356
1357 // Last item doesn't get and'ed on every bit, so we have to mask away
1358 size_t numBits = (size_t)size - whichResult * 64;
1359 cl_long bits = 0;
1360 for (size_t i = 0; i < numBits; i++) bits |= (1LL << i);
1361
1362 return bits;
1363 }
1364
test_atomic_or(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1365 int test_atomic_or(cl_device_id deviceID, cl_context context,
1366 cl_command_queue queue, int num_elements)
1367 {
1368 TestFns set = {
1369 0, 0LL, test_bitwise_num_results, test_atomic_or_result_int,
1370 NULL, NULL, test_atomic_or_result_long, NULL,
1371 NULL
1372 };
1373
1374 if (test_atomic_function_set(
1375 deviceID, context, queue, num_elements, atom_or_core, set, true,
1376 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
1377 != 0)
1378 return -1;
1379 if (test_atomic_function_set(
1380 deviceID, context, queue, num_elements, atomic_or_core, set, true,
1381 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
1382 != 0)
1383 return -1;
1384 return 0;
1385 }
1386
1387
1388 #pragma mark ---- xor
1389
// Kernel cores for the XOR test: every thread toggles one bit (tid modulo
// the bit width) of the single shared destination word.
const char atom_xor_core[] =
    "  size_t numBits = sizeof( destMemory[0] ) * 8;\n"
    "  int bitIndex = tid & ( numBits - 1 );\n"
    "\n"
    "  oldValues[tid] = atom_xor( &destMemory[0], 1L << bitIndex );\n";

// Same operation using the atomic_ prefix.
const char atomic_xor_core[] =
    "  size_t numBits = sizeof( destMemory[0] ) * 8;\n"
    "  int bitIndex = tid & ( numBits - 1 );\n"
    "\n"
    "  oldValues[tid] = atomic_xor( &destMemory[0], 1L << bitIndex );\n";
1401
test_atomic_xor_result_int(size_t size,cl_int * startRefValues,size_t whichResult)1402 cl_int test_atomic_xor_result_int(size_t size, cl_int *startRefValues,
1403 size_t whichResult)
1404 {
1405 cl_int total = 0x2f08ab41;
1406 for (size_t i = 0; i < size; i++) total ^= (1 << (i & 31));
1407 return total;
1408 }
1409
test_atomic_xor_result_long(size_t size,cl_long * startRefValues,size_t whichResult)1410 cl_long test_atomic_xor_result_long(size_t size, cl_long *startRefValues,
1411 size_t whichResult)
1412 {
1413 cl_long total = 0x2f08ab418ba0541LL;
1414 for (size_t i = 0; i < size; i++) total ^= (1LL << (i & 63));
1415 return total;
1416 }
1417
test_atomic_xor(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1418 int test_atomic_xor(cl_device_id deviceID, cl_context context,
1419 cl_command_queue queue, int num_elements)
1420 {
1421 TestFns set = { 0x2f08ab41,
1422 0x2f08ab418ba0541LL,
1423 NULL,
1424 test_atomic_xor_result_int,
1425 NULL,
1426 NULL,
1427 test_atomic_xor_result_long,
1428 NULL,
1429 NULL };
1430
1431 if (test_atomic_function_set(
1432 deviceID, context, queue, num_elements, atom_xor_core, set, true,
1433 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
1434 != 0)
1435 return -1;
1436 if (test_atomic_function_set(
1437 deviceID, context, queue, num_elements, atomic_xor_core, set, true,
1438 /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
1439 != 0)
1440 return -1;
1441 return 0;
1442 }
1443