xref: /aosp_15_r20/external/OpenCL-CTS/test_conformance/atomics/test_atomics.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "testBase.h"
17 #include "harness/conversions.h"
18 #ifndef _WIN32
19 #include <unistd.h>
20 #endif
21 
22 #include <cinttypes>
23 
24 #define INT_TEST_VALUE 402258822
25 #define LONG_TEST_VALUE 515154531254381446LL
26 
27 // clang-format off
28 const char *atomic_global_pattern[] = {
29     "__kernel void test_atomic_fn(volatile __global %s *destMemory, __global %s *oldValues)\n"
30     "{\n"
31     "    int  tid = get_global_id(0);\n"
32     "\n"
33     ,
34     "\n"
35     "}\n" };
36 
37 const char *atomic_local_pattern[] = {
38     "__kernel void test_atomic_fn(__global %s *finalDest, __global %s *oldValues, volatile __local %s *destMemory, int numDestItems )\n"
39     "{\n"
40     "    int  tid = get_global_id(0);\n"
41     "    int  dstItemIdx;\n"
42     "\n"
43     "    // Everybody does the following line(s), but it all has the same result. We still need to ensure we sync before the atomic op, though\n"
44     "    for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
45     "        destMemory[ dstItemIdx ] = finalDest[ dstItemIdx ];\n"
46     "    barrier( CLK_LOCAL_MEM_FENCE );\n"
47     "\n"
48     ,
49     "    barrier( CLK_LOCAL_MEM_FENCE );\n"
50     "    // Finally, write out the last value. Again, we're synced, so everyone will be writing the same value\n"
51     "    for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
52     "        finalDest[ dstItemIdx ] = destMemory[ dstItemIdx ];\n"
53     "}\n" };
54 // clang-format on
55 
56 
57 #define TEST_COUNT 128 * 1024
58 
59 
60 struct TestFns
61 {
62     cl_int mIntStartValue;
63     cl_long mLongStartValue;
64 
65     size_t (*NumResultsFn)(size_t threadSize, ExplicitType dataType);
66 
67     // Integer versions
68     cl_int (*ExpectedValueIntFn)(size_t size, cl_int *startRefValues,
69                                  size_t whichDestValue);
70     void (*GenerateRefsIntFn)(size_t size, cl_int *startRefValues, MTdata d);
71     bool (*VerifyRefsIntFn)(size_t size, cl_int *refValues, cl_int finalValue);
72 
73     // Long versions
74     cl_long (*ExpectedValueLongFn)(size_t size, cl_long *startRefValues,
75                                    size_t whichDestValue);
76     void (*GenerateRefsLongFn)(size_t size, cl_long *startRefValues, MTdata d);
77     bool (*VerifyRefsLongFn)(size_t size, cl_long *refValues,
78                              cl_long finalValue);
79 
80     // Float versions
81     cl_float (*ExpectedValueFloatFn)(size_t size, cl_float *startRefValues,
82                                      size_t whichDestValue);
83     void (*GenerateRefsFloatFn)(size_t size, cl_float *startRefValues,
84                                 MTdata d);
85     bool (*VerifyRefsFloatFn)(size_t size, cl_float *refValues,
86                               cl_float finalValue);
87 };
88 
check_atomic_support(cl_device_id device,bool extended,bool isLocal,ExplicitType dataType)89 bool check_atomic_support(cl_device_id device, bool extended, bool isLocal,
90                           ExplicitType dataType)
91 {
92     // clang-format off
93     const char *extensionNames[8] = {
94         "cl_khr_global_int32_base_atomics", "cl_khr_global_int32_extended_atomics",
95         "cl_khr_local_int32_base_atomics",  "cl_khr_local_int32_extended_atomics",
96         "cl_khr_int64_base_atomics",        "cl_khr_int64_extended_atomics",
97         "cl_khr_int64_base_atomics",        "cl_khr_int64_extended_atomics"       // this line intended to be the same as the last one
98     };
99     // clang-format on
100 
101     size_t index = 0;
102     if (extended) index += 1;
103     if (isLocal) index += 2;
104 
105     Version version = get_device_cl_version(device);
106 
107     switch (dataType)
108     {
109         case kInt:
110         case kUInt:
111             if (version >= Version(1, 1)) return 1;
112             break;
113         case kLong:
114         case kULong: index += 4; break;
115         case kFloat: // this has to stay separate since the float atomics arent
116                      // in the 1.0 extensions
117             return version >= Version(1, 1);
118         default:
119             log_error(
120                 "ERROR:  Unsupported data type (%d) in check_atomic_support\n",
121                 dataType);
122             return 0;
123     }
124 
125     return is_extension_available(device, extensionNames[index]);
126 }
127 
test_atomic_function(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,const char * programCore,TestFns testFns,bool extended,bool isLocal,ExplicitType dataType,bool matchGroupSize)128 int test_atomic_function(cl_device_id deviceID, cl_context context,
129                          cl_command_queue queue, int num_elements,
130                          const char *programCore, TestFns testFns,
131                          bool extended, bool isLocal, ExplicitType dataType,
132                          bool matchGroupSize)
133 {
134     clProgramWrapper program;
135     clKernelWrapper kernel;
136     int error;
137     size_t threads[1];
138     clMemWrapper streams[2];
139     void *refValues, *startRefValues;
140     size_t threadSize, groupSize;
141     const char *programLines[4];
142     char pragma[512];
143     char programHeader[512];
144     MTdata d;
145     size_t typeSize = get_explicit_type_size(dataType);
146 
147 
148     // Verify we can run first
149     bool isUnsigned = (dataType == kULong) || (dataType == kUInt);
150     if (!check_atomic_support(deviceID, extended, isLocal, dataType))
151     {
152         // Only print for the signed (unsigned comes right after, and if signed
153         // isn't supported, unsigned isn't either)
154         if (dataType == kFloat)
155             log_info("\t%s float not supported\n",
156                      isLocal ? "Local" : "Global");
157         else if (!isUnsigned)
158             log_info("\t%s %sint%d not supported\n",
159                      isLocal ? "Local" : "Global", isUnsigned ? "u" : "",
160                      (int)typeSize * 8);
161         // Since we don't support the operation, they implicitly pass
162         return 0;
163     }
164     else
165     {
166         if (dataType == kFloat)
167             log_info("\t%s float%s...", isLocal ? "local" : "global",
168                      isLocal ? " " : "");
169         else
170             log_info("\t%s %sint%d%s%s...", isLocal ? "local" : "global",
171                      isUnsigned ? "u" : "", (int)typeSize * 8,
172                      isUnsigned ? "" : " ", isLocal ? " " : "");
173     }
174 
175     //// Set up the kernel code
176 
177     // Create the pragma line for this kernel
178     bool isLong = (dataType == kLong || dataType == kULong);
179     sprintf(pragma,
180             "#pragma OPENCL EXTENSION cl_khr%s_int%s_%s_atomics : enable\n",
181             isLong ? "" : (isLocal ? "_local" : "_global"),
182             isLong ? "64" : "32", extended ? "extended" : "base");
183 
184     // Now create the program header
185     const char *typeName = get_explicit_type_name(dataType);
186     if (isLocal)
187         sprintf(programHeader, atomic_local_pattern[0], typeName, typeName,
188                 typeName);
189     else
190         sprintf(programHeader, atomic_global_pattern[0], typeName, typeName);
191 
192     // Set up our entire program now
193     programLines[0] = pragma;
194     programLines[1] = programHeader;
195     programLines[2] = programCore;
196     programLines[3] =
197         (isLocal) ? atomic_local_pattern[1] : atomic_global_pattern[1];
198 
199     if (create_single_kernel_helper(context, &program, &kernel, 4, programLines,
200                                     "test_atomic_fn"))
201     {
202         return -1;
203     }
204 
205     //// Set up to actually run
206     threadSize = num_elements;
207 
208     error =
209         get_max_common_work_group_size(context, kernel, threadSize, &groupSize);
210     test_error(error, "Unable to get thread group max size");
211 
212     if (matchGroupSize)
213         // HACK because xchg and cmpxchg apparently are limited by hardware
214         threadSize = groupSize;
215 
216     if (isLocal)
217     {
218         size_t maxSizes[3] = { 0, 0, 0 };
219         error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
220                                 3 * sizeof(size_t), maxSizes, 0);
221         test_error(error,
222                    "Unable to obtain max work item sizes for the device");
223 
224         size_t workSize;
225         error = clGetKernelWorkGroupInfo(kernel, deviceID,
226                                          CL_KERNEL_WORK_GROUP_SIZE,
227                                          sizeof(workSize), &workSize, NULL);
228         test_error(
229             error,
230             "Unable to obtain max work group size for device and kernel combo");
231 
232         // Limit workSize to avoid extremely large local buffer size and slow
233         // run.
234         if (workSize > 65536) workSize = 65536;
235 
236         // "workSize" is limited to that of the first dimension as only a
237         // 1DRange is executed.
238         if (maxSizes[0] < workSize)
239         {
240             workSize = maxSizes[0];
241         }
242 
243         threadSize = groupSize = workSize;
244     }
245 
246 
247     log_info("\t(thread count %d, group size %d)\n", (int)threadSize,
248              (int)groupSize);
249 
250     refValues = (cl_int *)malloc(typeSize * threadSize);
251 
252     if (testFns.GenerateRefsIntFn != NULL)
253     {
254         // We have a ref generator provided
255         d = init_genrand(gRandomSeed);
256         startRefValues = malloc(typeSize * threadSize);
257         if (typeSize == 4)
258             testFns.GenerateRefsIntFn(threadSize, (cl_int *)startRefValues, d);
259         else
260             testFns.GenerateRefsLongFn(threadSize, (cl_long *)startRefValues,
261                                        d);
262         free_mtdata(d);
263         d = NULL;
264     }
265     else
266         startRefValues = NULL;
267 
268     // If we're given a num_results function, we need to determine how many
269     // result objects we need. If we don't have it, we assume it's just 1
270     size_t numDestItems = (testFns.NumResultsFn != NULL)
271         ? testFns.NumResultsFn(threadSize, dataType)
272         : 1;
273 
274     char *destItems = new char[typeSize * numDestItems];
275     if (destItems == NULL)
276     {
277         log_error("ERROR: Unable to allocate memory!\n");
278         return -1;
279     }
280     void *startValue = (typeSize == 4) ? (void *)&testFns.mIntStartValue
281                                        : (void *)&testFns.mLongStartValue;
282     for (size_t i = 0; i < numDestItems; i++)
283         memcpy(destItems + i * typeSize, startValue, typeSize);
284 
285     streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
286                                 typeSize * numDestItems, destItems, NULL);
287     if (!streams[0])
288     {
289         log_error("ERROR: Creating output array failed!\n");
290         return -1;
291     }
292     streams[1] = clCreateBuffer(
293         context,
294         ((startRefValues != NULL ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE)),
295         typeSize * threadSize, startRefValues, NULL);
296     if (!streams[1])
297     {
298         log_error("ERROR: Creating reference array failed!\n");
299         return -1;
300     }
301 
302     /* Set the arguments */
303     error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
304     test_error(error, "Unable to set indexed kernel arguments");
305     error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
306     test_error(error, "Unable to set indexed kernel arguments");
307 
308     if (isLocal)
309     {
310         error = clSetKernelArg(kernel, 2, typeSize * numDestItems, NULL);
311         test_error(error, "Unable to set indexed local kernel argument");
312 
313         cl_int numDestItemsInt = (cl_int)numDestItems;
314         error = clSetKernelArg(kernel, 3, sizeof(cl_int), &numDestItemsInt);
315         test_error(error, "Unable to set indexed kernel argument");
316     }
317 
318     /* Run the kernel */
319     threads[0] = threadSize;
320     error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, &groupSize,
321                                    0, NULL, NULL);
322     test_error(error, "Unable to execute test kernel");
323 
324     error =
325         clEnqueueReadBuffer(queue, streams[0], true, 0, typeSize * numDestItems,
326                             destItems, 0, NULL, NULL);
327     test_error(error, "Unable to read result value!");
328 
329     error =
330         clEnqueueReadBuffer(queue, streams[1], true, 0, typeSize * threadSize,
331                             refValues, 0, NULL, NULL);
332     test_error(error, "Unable to read reference values!");
333 
334     // If we have an expectedFn, then we need to generate a final value to
335     // compare against. If we don't have one, it's because we're comparing ref
336     // values only
337     if (testFns.ExpectedValueIntFn != NULL)
338     {
339         for (size_t i = 0; i < numDestItems; i++)
340         {
341             char expected[8];
342             cl_int intVal;
343             cl_long longVal;
344             if (typeSize == 4)
345             {
346                 // Int version
347                 intVal = testFns.ExpectedValueIntFn(
348                     threadSize, (cl_int *)startRefValues, i);
349                 memcpy(expected, &intVal, sizeof(intVal));
350             }
351             else
352             {
353                 // Long version
354                 longVal = testFns.ExpectedValueLongFn(
355                     threadSize, (cl_long *)startRefValues, i);
356                 memcpy(expected, &longVal, sizeof(longVal));
357             }
358 
359             if (memcmp(expected, destItems + i * typeSize, typeSize) != 0)
360             {
361                 if (typeSize == 4)
362                 {
363                     cl_int *outValue = (cl_int *)(destItems + i * typeSize);
364                     log_error("ERROR: Result %zu from kernel does not "
365                               "validate! (should be %d, was %d)\n",
366                               i, intVal, *outValue);
367                     cl_int *startRefs = (cl_int *)startRefValues;
368                     cl_int *refs = (cl_int *)refValues;
369                     for (i = 0; i < threadSize; i++)
370                     {
371                         if (startRefs != NULL)
372                             log_info(" --- %zu - %d --- %d\n", i, startRefs[i],
373                                      refs[i]);
374                         else
375                             log_info(" --- %zu --- %d\n", i, refs[i]);
376                     }
377                 }
378                 else
379                 {
380                     cl_long *outValue = (cl_long *)(destItems + i * typeSize);
381                     log_error("ERROR: Result %zu from kernel does not "
382                               "validate! (should be %" PRId64 ", was %" PRId64
383                               ")\n",
384                               i, longVal, *outValue);
385                     cl_long *startRefs = (cl_long *)startRefValues;
386                     cl_long *refs = (cl_long *)refValues;
387                     for (i = 0; i < threadSize; i++)
388                     {
389                         if (startRefs != NULL)
390                             log_info(" --- %zu - %" PRId64 " --- %" PRId64 "\n",
391                                      i, startRefs[i], refs[i]);
392                         else
393                             log_info(" --- %zu --- %" PRId64 "\n", i, refs[i]);
394                     }
395                 }
396                 return -1;
397             }
398         }
399     }
400 
401     if (testFns.VerifyRefsIntFn != NULL)
402     {
403         /* Use the verify function to also check the results */
404         if (dataType == kFloat)
405         {
406             cl_float *outValue = (cl_float *)destItems;
407             if (!testFns.VerifyRefsFloatFn(threadSize, (cl_float *)refValues,
408                                            *outValue)
409                 != 0)
410             {
411                 log_error("ERROR: Reference values did not validate!\n");
412                 return -1;
413             }
414         }
415         else if (typeSize == 4)
416         {
417             cl_int *outValue = (cl_int *)destItems;
418             if (!testFns.VerifyRefsIntFn(threadSize, (cl_int *)refValues,
419                                          *outValue)
420                 != 0)
421             {
422                 log_error("ERROR: Reference values did not validate!\n");
423                 return -1;
424             }
425         }
426         else
427         {
428             cl_long *outValue = (cl_long *)destItems;
429             if (!testFns.VerifyRefsLongFn(threadSize, (cl_long *)refValues,
430                                           *outValue)
431                 != 0)
432             {
433                 log_error("ERROR: Reference values did not validate!\n");
434                 return -1;
435             }
436         }
437     }
438     else if (testFns.ExpectedValueIntFn == NULL)
439     {
440         log_error("ERROR: Test doesn't check total or refs; no values are "
441                   "verified!\n");
442         return -1;
443     }
444 
445 
446     /* Re-write the starting value */
447     for (size_t i = 0; i < numDestItems; i++)
448         memcpy(destItems + i * typeSize, startValue, typeSize);
449     error =
450         clEnqueueWriteBuffer(queue, streams[0], true, 0,
451                              typeSize * numDestItems, destItems, 0, NULL, NULL);
452     test_error(error, "Unable to write starting values!");
453 
454     /* Run the kernel once for a single thread, so we can verify that the
455      * returned value is the original one */
456     threads[0] = 1;
457     error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads, threads, 0,
458                                    NULL, NULL);
459     test_error(error, "Unable to execute test kernel");
460 
461     error = clEnqueueReadBuffer(queue, streams[1], true, 0, typeSize, refValues,
462                                 0, NULL, NULL);
463     test_error(error, "Unable to read reference values!");
464 
465     if (memcmp(refValues, destItems, typeSize) != 0)
466     {
467         if (typeSize == 4)
468         {
469             cl_int *s = (cl_int *)destItems;
470             cl_int *r = (cl_int *)refValues;
471             log_error("ERROR: atomic function operated correctly but did NOT "
472                       "return correct 'old' value "
473                       " (should have been %d, returned %d)!\n",
474                       *s, *r);
475         }
476         else
477         {
478             cl_long *s = (cl_long *)destItems;
479             cl_long *r = (cl_long *)refValues;
480             log_error("ERROR: atomic function operated correctly but did NOT "
481                       "return correct 'old' value "
482                       " (should have been %" PRId64 ", returned %" PRId64
483                       ")!\n",
484                       *s, *r);
485         }
486         return -1;
487     }
488 
489     delete[] destItems;
490     free(refValues);
491     if (startRefValues != NULL) free(startRefValues);
492 
493     return 0;
494 }
495 
test_atomic_function_set(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements,const char * programCore,TestFns testFns,bool extended,bool matchGroupSize,bool usingAtomicPrefix)496 int test_atomic_function_set(cl_device_id deviceID, cl_context context,
497                              cl_command_queue queue, int num_elements,
498                              const char *programCore, TestFns testFns,
499                              bool extended, bool matchGroupSize,
500                              bool usingAtomicPrefix)
501 {
502     log_info("    Testing %s functions...\n",
503              usingAtomicPrefix ? "atomic_" : "atom_");
504 
505     int errors = 0;
506     errors |= test_atomic_function(deviceID, context, queue, num_elements,
507                                    programCore, testFns, extended, false, kInt,
508                                    matchGroupSize);
509     errors |= test_atomic_function(deviceID, context, queue, num_elements,
510                                    programCore, testFns, extended, false, kUInt,
511                                    matchGroupSize);
512     errors |= test_atomic_function(deviceID, context, queue, num_elements,
513                                    programCore, testFns, extended, true, kInt,
514                                    matchGroupSize);
515     errors |= test_atomic_function(deviceID, context, queue, num_elements,
516                                    programCore, testFns, extended, true, kUInt,
517                                    matchGroupSize);
518 
519     // Only the 32 bit atomic functions use the "atomic" prefix in 1.1, the 64
520     // bit functions still use the "atom" prefix. The argument usingAtomicPrefix
521     // is set to true if programCore was generated with the "atomic" prefix.
522     if (!usingAtomicPrefix)
523     {
524         errors |= test_atomic_function(deviceID, context, queue, num_elements,
525                                        programCore, testFns, extended, false,
526                                        kLong, matchGroupSize);
527         errors |= test_atomic_function(deviceID, context, queue, num_elements,
528                                        programCore, testFns, extended, false,
529                                        kULong, matchGroupSize);
530         errors |= test_atomic_function(deviceID, context, queue, num_elements,
531                                        programCore, testFns, extended, true,
532                                        kLong, matchGroupSize);
533         errors |= test_atomic_function(deviceID, context, queue, num_elements,
534                                        programCore, testFns, extended, true,
535                                        kULong, matchGroupSize);
536     }
537 
538     return errors;
539 }
540 
541 #pragma mark ---- add
542 
543 const char atom_add_core[] =
544     "    oldValues[tid] = atom_add( &destMemory[0], tid + 3 );\n"
545     "    atom_add( &destMemory[0], tid + 3 );\n"
546     "    atom_add( &destMemory[0], tid + 3 );\n"
547     "    atom_add( &destMemory[0], tid + 3 );\n";
548 
549 const char atomic_add_core[] =
550     "    oldValues[tid] = atomic_add( &destMemory[0], tid + 3 );\n"
551     "    atomic_add( &destMemory[0], tid + 3 );\n"
552     "    atomic_add( &destMemory[0], tid + 3 );\n"
553     "    atomic_add( &destMemory[0], tid + 3 );\n";
554 
test_atomic_add_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)555 cl_int test_atomic_add_result_int(size_t size, cl_int *startRefValues,
556                                   size_t whichDestValue)
557 {
558     cl_int total = 0;
559     for (size_t i = 0; i < size; i++) total += ((cl_int)i + 3) * 4;
560     return total;
561 }
562 
test_atomic_add_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)563 cl_long test_atomic_add_result_long(size_t size, cl_long *startRefValues,
564                                     size_t whichDestValue)
565 {
566     cl_long total = 0;
567     for (size_t i = 0; i < size; i++) total += ((i + 3) * 4);
568     return total;
569 }
570 
test_atomic_add(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)571 int test_atomic_add(cl_device_id deviceID, cl_context context,
572                     cl_command_queue queue, int num_elements)
573 {
574     TestFns set = { 0,
575                     0LL,
576                     NULL,
577                     test_atomic_add_result_int,
578                     NULL,
579                     NULL,
580                     test_atomic_add_result_long,
581                     NULL,
582                     NULL };
583 
584     if (test_atomic_function_set(
585             deviceID, context, queue, num_elements, atom_add_core, set, false,
586             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
587         != 0)
588         return -1;
589     if (test_atomic_function_set(
590             deviceID, context, queue, num_elements, atomic_add_core, set, false,
591             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
592         != 0)
593         return -1;
594     return 0;
595 }
596 
597 #pragma mark ---- sub
598 
599 const char atom_sub_core[] =
600     "    oldValues[tid] = atom_sub( &destMemory[0], tid + 3 );\n";
601 
602 const char atomic_sub_core[] =
603     "    oldValues[tid] = atomic_sub( &destMemory[0], tid + 3 );\n";
604 
test_atomic_sub_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)605 cl_int test_atomic_sub_result_int(size_t size, cl_int *startRefValues,
606                                   size_t whichDestValue)
607 {
608     cl_int total = INT_TEST_VALUE;
609     for (size_t i = 0; i < size; i++) total -= (cl_int)i + 3;
610     return total;
611 }
612 
test_atomic_sub_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)613 cl_long test_atomic_sub_result_long(size_t size, cl_long *startRefValues,
614                                     size_t whichDestValue)
615 {
616     cl_long total = LONG_TEST_VALUE;
617     for (size_t i = 0; i < size; i++) total -= i + 3;
618     return total;
619 }
620 
test_atomic_sub(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)621 int test_atomic_sub(cl_device_id deviceID, cl_context context,
622                     cl_command_queue queue, int num_elements)
623 {
624     TestFns set = { INT_TEST_VALUE,
625                     LONG_TEST_VALUE,
626                     NULL,
627                     test_atomic_sub_result_int,
628                     NULL,
629                     NULL,
630                     test_atomic_sub_result_long,
631                     NULL,
632                     NULL };
633 
634     if (test_atomic_function_set(
635             deviceID, context, queue, num_elements, atom_sub_core, set, false,
636             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
637         != 0)
638         return -1;
639     if (test_atomic_function_set(
640             deviceID, context, queue, num_elements, atomic_sub_core, set, false,
641             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
642         != 0)
643         return -1;
644     return 0;
645 }
646 
647 #pragma mark ---- xchg
648 
649 const char atom_xchg_core[] =
650     "    oldValues[tid] = atom_xchg( &destMemory[0], tid );\n";
651 
652 const char atomic_xchg_core[] =
653     "    oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
654 const char atomic_xchg_float_core[] =
655     "    oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
656 
test_atomic_xchg_verify_int(size_t size,cl_int * refValues,cl_int finalValue)657 bool test_atomic_xchg_verify_int(size_t size, cl_int *refValues,
658                                  cl_int finalValue)
659 {
660     /* For xchg, each value from 0 to size - 1 should have an entry in the ref
661      * array, and ONLY one entry */
662     char *valids;
663     size_t i;
664     char originalValidCount = 0;
665 
666     valids = (char *)malloc(sizeof(char) * size);
667     memset(valids, 0, sizeof(char) * size);
668 
669     for (i = 0; i < size; i++)
670     {
671         if (refValues[i] == INT_TEST_VALUE)
672         {
673             // Special initial value
674             originalValidCount++;
675             continue;
676         }
677         if (refValues[i] < 0 || (size_t)refValues[i] >= size)
678         {
679             log_error(
680                 "ERROR: Reference value %zu outside of valid range! (%d)\n", i,
681                 refValues[i]);
682             return false;
683         }
684         valids[refValues[i]]++;
685     }
686 
687     /* Note: ONE entry will have zero count. It'll be the last one that
688      executed, because that value should be the final value outputted */
689     if (valids[finalValue] > 0)
690     {
691         log_error("ERROR: Final value %d was also in ref list!\n", finalValue);
692         return false;
693     }
694     else
695         valids[finalValue] = 1; // So the following loop will be okay
696 
697     /* Now check that every entry has one and only one count */
698     if (originalValidCount != 1)
699     {
700         log_error("ERROR: Starting reference value %d did not occur "
701                   "once-and-only-once (occurred %d)\n",
702                   65191, originalValidCount);
703         return false;
704     }
705     for (i = 0; i < size; i++)
706     {
707         if (valids[i] != 1)
708         {
709             log_error("ERROR: Reference value %zu did not occur "
710                       "once-and-only-once (occurred %d)\n",
711                       i, valids[i]);
712             for (size_t j = 0; j < size; j++)
713                 log_info("%d: %d\n", (int)j, (int)valids[j]);
714             return false;
715         }
716     }
717 
718     free(valids);
719     return true;
720 }
721 
test_atomic_xchg_verify_long(size_t size,cl_long * refValues,cl_long finalValue)722 bool test_atomic_xchg_verify_long(size_t size, cl_long *refValues,
723                                   cl_long finalValue)
724 {
725     /* For xchg, each value from 0 to size - 1 should have an entry in the ref
726      * array, and ONLY one entry */
727     char *valids;
728     size_t i;
729     char originalValidCount = 0;
730 
731     valids = (char *)malloc(sizeof(char) * size);
732     memset(valids, 0, sizeof(char) * size);
733 
734     for (i = 0; i < size; i++)
735     {
736         if (refValues[i] == LONG_TEST_VALUE)
737         {
738             // Special initial value
739             originalValidCount++;
740             continue;
741         }
742         if (refValues[i] < 0 || (size_t)refValues[i] >= size)
743         {
744             log_error(
745                 "ERROR: Reference value %zu outside of valid range! (%" PRId64
746                 ")\n",
747                 i, refValues[i]);
748             return false;
749         }
750         valids[refValues[i]]++;
751     }
752 
753     /* Note: ONE entry will have zero count. It'll be the last one that
754      executed, because that value should be the final value outputted */
755     if (valids[finalValue] > 0)
756     {
757         log_error("ERROR: Final value %" PRId64 " was also in ref list!\n",
758                   finalValue);
759         return false;
760     }
761     else
762         valids[finalValue] = 1; // So the following loop will be okay
763 
764     /* Now check that every entry has one and only one count */
765     if (originalValidCount != 1)
766     {
767         log_error("ERROR: Starting reference value %d did not occur "
768                   "once-and-only-once (occurred %d)\n",
769                   65191, originalValidCount);
770         return false;
771     }
772     for (i = 0; i < size; i++)
773     {
774         if (valids[i] != 1)
775         {
776             log_error("ERROR: Reference value %zu did not occur "
777                       "once-and-only-once (occurred %d)\n",
778                       i, valids[i]);
779             for (size_t j = 0; j < size; j++)
780                 log_info("%d: %d\n", (int)j, (int)valids[j]);
781             return false;
782         }
783     }
784 
785     free(valids);
786     return true;
787 }
788 
test_atomic_xchg_verify_float(size_t size,cl_float * refValues,cl_float finalValue)789 bool test_atomic_xchg_verify_float(size_t size, cl_float *refValues,
790                                    cl_float finalValue)
791 {
792     /* For xchg, each value from 0 to size - 1 should have an entry in the ref
793      * array, and ONLY one entry */
794     char *valids;
795     size_t i;
796     char originalValidCount = 0;
797 
798     valids = (char *)malloc(sizeof(char) * size);
799     memset(valids, 0, sizeof(char) * size);
800 
801     for (i = 0; i < size; i++)
802     {
803         cl_int *intRefValue = (cl_int *)(&refValues[i]);
804         if (*intRefValue == INT_TEST_VALUE)
805         {
806             // Special initial value
807             originalValidCount++;
808             continue;
809         }
810         if (refValues[i] < 0 || (size_t)refValues[i] >= size)
811         {
812             log_error(
813                 "ERROR: Reference value %zu outside of valid range! (%a)\n", i,
814                 refValues[i]);
815             return false;
816         }
817         valids[(int)refValues[i]]++;
818     }
819 
820     /* Note: ONE entry will have zero count. It'll be the last one that
821      executed, because that value should be the final value outputted */
822     if (valids[(int)finalValue] > 0)
823     {
824         log_error("ERROR: Final value %a was also in ref list!\n", finalValue);
825         return false;
826     }
827     else
828         valids[(int)finalValue] = 1; // So the following loop will be okay
829 
830     /* Now check that every entry has one and only one count */
831     if (originalValidCount != 1)
832     {
833         log_error("ERROR: Starting reference value %d did not occur "
834                   "once-and-only-once (occurred %d)\n",
835                   65191, originalValidCount);
836         return false;
837     }
838     for (i = 0; i < size; i++)
839     {
840         if (valids[i] != 1)
841         {
842             log_error("ERROR: Reference value %zu did not occur "
843                       "once-and-only-once (occurred %d)\n",
844                       i, valids[i]);
845             for (size_t j = 0; j < size; j++)
846                 log_info("%d: %d\n", (int)j, (int)valids[j]);
847             return false;
848         }
849     }
850 
851     free(valids);
852     return true;
853 }
854 
test_atomic_xchg(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)855 int test_atomic_xchg(cl_device_id deviceID, cl_context context,
856                      cl_command_queue queue, int num_elements)
857 {
858     TestFns set = { INT_TEST_VALUE,
859                     LONG_TEST_VALUE,
860                     NULL,
861                     NULL,
862                     NULL,
863                     test_atomic_xchg_verify_int,
864                     NULL,
865                     NULL,
866                     test_atomic_xchg_verify_long,
867                     NULL,
868                     NULL,
869                     test_atomic_xchg_verify_float };
870 
871     int errors = test_atomic_function_set(
872         deviceID, context, queue, num_elements, atom_xchg_core, set, false,
873         true, /*usingAtomicPrefix*/ false);
874     errors |= test_atomic_function_set(deviceID, context, queue, num_elements,
875                                        atomic_xchg_core, set, false, true,
876                                        /*usingAtomicPrefix*/ true);
877 
878     errors |= test_atomic_function(deviceID, context, queue, num_elements,
879                                    atomic_xchg_float_core, set, false, false,
880                                    kFloat, true);
881     errors |= test_atomic_function(deviceID, context, queue, num_elements,
882                                    atomic_xchg_float_core, set, false, true,
883                                    kFloat, true);
884 
885     return errors;
886 }
887 
888 
889 #pragma mark ---- min
890 
891 const char atom_min_core[] =
892     "    oldValues[tid] = atom_min( &destMemory[0], oldValues[tid] );\n";
893 
894 const char atomic_min_core[] =
895     "    oldValues[tid] = atomic_min( &destMemory[0], oldValues[tid] );\n";
896 
test_atomic_min_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)897 cl_int test_atomic_min_result_int(size_t size, cl_int *startRefValues,
898                                   size_t whichDestValue)
899 {
900     cl_int total = 0x7fffffffL;
901     for (size_t i = 0; i < size; i++)
902     {
903         if (startRefValues[i] < total) total = startRefValues[i];
904     }
905     return total;
906 }
907 
test_atomic_min_gen_int(size_t size,cl_int * startRefValues,MTdata d)908 void test_atomic_min_gen_int(size_t size, cl_int *startRefValues, MTdata d)
909 {
910     for (size_t i = 0; i < size; i++)
911         startRefValues[i] =
912             (cl_int)(genrand_int32(d) % 0x3fffffff) + 0x3fffffff;
913 }
914 
test_atomic_min_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)915 cl_long test_atomic_min_result_long(size_t size, cl_long *startRefValues,
916                                     size_t whichDestValue)
917 {
918     cl_long total = 0x7fffffffffffffffLL;
919     for (size_t i = 0; i < size; i++)
920     {
921         if (startRefValues[i] < total) total = startRefValues[i];
922     }
923     return total;
924 }
925 
test_atomic_min_gen_long(size_t size,cl_long * startRefValues,MTdata d)926 void test_atomic_min_gen_long(size_t size, cl_long *startRefValues, MTdata d)
927 {
928     for (size_t i = 0; i < size; i++)
929         startRefValues[i] =
930             (cl_long)(genrand_int32(d)
931                       | (((cl_long)genrand_int32(d) & 0x7fffffffL) << 16));
932 }
933 
test_atomic_min(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)934 int test_atomic_min(cl_device_id deviceID, cl_context context,
935                     cl_command_queue queue, int num_elements)
936 {
937     TestFns set = { 0x7fffffffL,
938                     0x7fffffffffffffffLL,
939                     NULL,
940                     test_atomic_min_result_int,
941                     test_atomic_min_gen_int,
942                     NULL,
943                     test_atomic_min_result_long,
944                     test_atomic_min_gen_long,
945                     NULL };
946 
947     if (test_atomic_function_set(
948             deviceID, context, queue, num_elements, atom_min_core, set, true,
949             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
950         != 0)
951         return -1;
952     if (test_atomic_function_set(
953             deviceID, context, queue, num_elements, atomic_min_core, set, true,
954             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
955         != 0)
956         return -1;
957     return 0;
958 }
959 
960 
961 #pragma mark ---- max
962 
963 const char atom_max_core[] =
964     "    oldValues[tid] = atom_max( &destMemory[0], oldValues[tid] );\n";
965 
966 const char atomic_max_core[] =
967     "    oldValues[tid] = atomic_max( &destMemory[0], oldValues[tid] );\n";
968 
test_atomic_max_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)969 cl_int test_atomic_max_result_int(size_t size, cl_int *startRefValues,
970                                   size_t whichDestValue)
971 {
972     cl_int total = 0;
973     for (size_t i = 0; i < size; i++)
974     {
975         if (startRefValues[i] > total) total = startRefValues[i];
976     }
977     return total;
978 }
979 
test_atomic_max_gen_int(size_t size,cl_int * startRefValues,MTdata d)980 void test_atomic_max_gen_int(size_t size, cl_int *startRefValues, MTdata d)
981 {
982     for (size_t i = 0; i < size; i++)
983         startRefValues[i] =
984             (cl_int)(genrand_int32(d) % 0x3fffffff) + 0x3fffffff;
985 }
986 
test_atomic_max_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)987 cl_long test_atomic_max_result_long(size_t size, cl_long *startRefValues,
988                                     size_t whichDestValue)
989 {
990     cl_long total = 0;
991     for (size_t i = 0; i < size; i++)
992     {
993         if (startRefValues[i] > total) total = startRefValues[i];
994     }
995     return total;
996 }
997 
test_atomic_max_gen_long(size_t size,cl_long * startRefValues,MTdata d)998 void test_atomic_max_gen_long(size_t size, cl_long *startRefValues, MTdata d)
999 {
1000     for (size_t i = 0; i < size; i++)
1001         startRefValues[i] =
1002             (cl_long)(genrand_int32(d)
1003                       | (((cl_long)genrand_int32(d) & 0x7fffffffL) << 16));
1004 }
1005 
test_atomic_max(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1006 int test_atomic_max(cl_device_id deviceID, cl_context context,
1007                     cl_command_queue queue, int num_elements)
1008 {
1009     TestFns set = { 0,
1010                     0,
1011                     NULL,
1012                     test_atomic_max_result_int,
1013                     test_atomic_max_gen_int,
1014                     NULL,
1015                     test_atomic_max_result_long,
1016                     test_atomic_max_gen_long,
1017                     NULL };
1018 
1019     if (test_atomic_function_set(
1020             deviceID, context, queue, num_elements, atom_max_core, set, true,
1021             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
1022         != 0)
1023         return -1;
1024     if (test_atomic_function_set(
1025             deviceID, context, queue, num_elements, atomic_max_core, set, true,
1026             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
1027         != 0)
1028         return -1;
1029     return 0;
1030 }
1031 
1032 
1033 #pragma mark ---- inc
1034 
1035 const char atom_inc_core[] =
1036     "    oldValues[tid] = atom_inc( &destMemory[0] );\n";
1037 
1038 const char atomic_inc_core[] =
1039     "    oldValues[tid] = atomic_inc( &destMemory[0] );\n";
1040 
test_atomic_inc_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)1041 cl_int test_atomic_inc_result_int(size_t size, cl_int *startRefValues,
1042                                   size_t whichDestValue)
1043 {
1044     return INT_TEST_VALUE + (cl_int)size;
1045 }
1046 
test_atomic_inc_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)1047 cl_long test_atomic_inc_result_long(size_t size, cl_long *startRefValues,
1048                                     size_t whichDestValue)
1049 {
1050     return LONG_TEST_VALUE + size;
1051 }
1052 
test_atomic_inc(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1053 int test_atomic_inc(cl_device_id deviceID, cl_context context,
1054                     cl_command_queue queue, int num_elements)
1055 {
1056     TestFns set = { INT_TEST_VALUE,
1057                     LONG_TEST_VALUE,
1058                     NULL,
1059                     test_atomic_inc_result_int,
1060                     NULL,
1061                     NULL,
1062                     test_atomic_inc_result_long,
1063                     NULL,
1064                     NULL };
1065 
1066     if (test_atomic_function_set(
1067             deviceID, context, queue, num_elements, atom_inc_core, set, false,
1068             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
1069         != 0)
1070         return -1;
1071     if (test_atomic_function_set(
1072             deviceID, context, queue, num_elements, atomic_inc_core, set, false,
1073             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
1074         != 0)
1075         return -1;
1076     return 0;
1077 }
1078 
1079 
1080 #pragma mark ---- dec
1081 
1082 const char atom_dec_core[] =
1083     "    oldValues[tid] = atom_dec( &destMemory[0] );\n";
1084 
1085 const char atomic_dec_core[] =
1086     "    oldValues[tid] = atomic_dec( &destMemory[0] );\n";
1087 
test_atomic_dec_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)1088 cl_int test_atomic_dec_result_int(size_t size, cl_int *startRefValues,
1089                                   size_t whichDestValue)
1090 {
1091     return INT_TEST_VALUE - (cl_int)size;
1092 }
1093 
test_atomic_dec_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)1094 cl_long test_atomic_dec_result_long(size_t size, cl_long *startRefValues,
1095                                     size_t whichDestValue)
1096 {
1097     return LONG_TEST_VALUE - size;
1098 }
1099 
test_atomic_dec(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1100 int test_atomic_dec(cl_device_id deviceID, cl_context context,
1101                     cl_command_queue queue, int num_elements)
1102 {
1103     TestFns set = { INT_TEST_VALUE,
1104                     LONG_TEST_VALUE,
1105                     NULL,
1106                     test_atomic_dec_result_int,
1107                     NULL,
1108                     NULL,
1109                     test_atomic_dec_result_long,
1110                     NULL,
1111                     NULL };
1112 
1113     if (test_atomic_function_set(
1114             deviceID, context, queue, num_elements, atom_dec_core, set, false,
1115             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
1116         != 0)
1117         return -1;
1118     if (test_atomic_function_set(
1119             deviceID, context, queue, num_elements, atomic_dec_core, set, false,
1120             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
1121         != 0)
1122         return -1;
1123     return 0;
1124 }
1125 
1126 
1127 #pragma mark ---- cmpxchg
1128 
1129 /* We test cmpxchg by implementing (the long way) atom_add */
1130 // clang-format off
1131 const char atom_cmpxchg_core[] =
1132     "    int oldValue, origValue, newValue;\n"
1133     "    do { \n"
1134     "        origValue = destMemory[0];\n"
1135     "        newValue = origValue + tid + 2;\n"
1136     "        oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
1137     "    } while( oldValue != origValue );\n"
1138     "    oldValues[tid] = oldValue;\n";
1139 
1140 const char atom_cmpxchg64_core[] =
1141     "    long oldValue, origValue, newValue;\n"
1142     "    do { \n"
1143     "        origValue = destMemory[0];\n"
1144     "        newValue = origValue + tid + 2;\n"
1145     "        oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
1146     "    } while( oldValue != origValue );\n"
1147     "    oldValues[tid] = oldValue;\n";
1148 
1149 const char atomic_cmpxchg_core[] =
1150     "    int oldValue, origValue, newValue;\n"
1151     "    do { \n"
1152     "        origValue = destMemory[0];\n"
1153     "        newValue = origValue + tid + 2;\n"
1154     "        oldValue = atomic_cmpxchg( &destMemory[0], origValue, newValue );\n"
1155     "    } while( oldValue != origValue );\n"
1156     "    oldValues[tid] = oldValue;\n";
1157 // clang-format on
1158 
test_atomic_cmpxchg_result_int(size_t size,cl_int * startRefValues,size_t whichDestValue)1159 cl_int test_atomic_cmpxchg_result_int(size_t size, cl_int *startRefValues,
1160                                       size_t whichDestValue)
1161 {
1162     cl_int total = INT_TEST_VALUE;
1163     for (size_t i = 0; i < size; i++) total += (cl_int)i + 2;
1164     return total;
1165 }
1166 
test_atomic_cmpxchg_result_long(size_t size,cl_long * startRefValues,size_t whichDestValue)1167 cl_long test_atomic_cmpxchg_result_long(size_t size, cl_long *startRefValues,
1168                                         size_t whichDestValue)
1169 {
1170     cl_long total = LONG_TEST_VALUE;
1171     for (size_t i = 0; i < size; i++) total += i + 2;
1172     return total;
1173 }
1174 
test_atomic_cmpxchg(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1175 int test_atomic_cmpxchg(cl_device_id deviceID, cl_context context,
1176                         cl_command_queue queue, int num_elements)
1177 {
1178     TestFns set = { INT_TEST_VALUE,
1179                     LONG_TEST_VALUE,
1180                     NULL,
1181                     test_atomic_cmpxchg_result_int,
1182                     NULL,
1183                     NULL,
1184                     test_atomic_cmpxchg_result_long,
1185                     NULL,
1186                     NULL };
1187 
1188     int errors = 0;
1189 
1190     log_info("    Testing atom_ functions...\n");
1191     errors |=
1192         test_atomic_function(deviceID, context, queue, num_elements,
1193                              atom_cmpxchg_core, set, false, false, kInt, true);
1194     errors |=
1195         test_atomic_function(deviceID, context, queue, num_elements,
1196                              atom_cmpxchg_core, set, false, false, kUInt, true);
1197     errors |=
1198         test_atomic_function(deviceID, context, queue, num_elements,
1199                              atom_cmpxchg_core, set, false, true, kInt, true);
1200     errors |=
1201         test_atomic_function(deviceID, context, queue, num_elements,
1202                              atom_cmpxchg_core, set, false, true, kUInt, true);
1203 
1204     errors |= test_atomic_function(deviceID, context, queue, num_elements,
1205                                    atom_cmpxchg64_core, set, false, false,
1206                                    kLong, true);
1207     errors |= test_atomic_function(deviceID, context, queue, num_elements,
1208                                    atom_cmpxchg64_core, set, false, false,
1209                                    kULong, true);
1210     errors |= test_atomic_function(deviceID, context, queue, num_elements,
1211                                    atom_cmpxchg64_core, set, false, true, kLong,
1212                                    true);
1213     errors |= test_atomic_function(deviceID, context, queue, num_elements,
1214                                    atom_cmpxchg64_core, set, false, true,
1215                                    kULong, true);
1216 
1217     log_info("    Testing atomic_ functions...\n");
1218     errors |= test_atomic_function(deviceID, context, queue, num_elements,
1219                                    atomic_cmpxchg_core, set, false, false, kInt,
1220                                    true);
1221     errors |= test_atomic_function(deviceID, context, queue, num_elements,
1222                                    atomic_cmpxchg_core, set, false, false,
1223                                    kUInt, true);
1224     errors |=
1225         test_atomic_function(deviceID, context, queue, num_elements,
1226                              atomic_cmpxchg_core, set, false, true, kInt, true);
1227     errors |= test_atomic_function(deviceID, context, queue, num_elements,
1228                                    atomic_cmpxchg_core, set, false, true, kUInt,
1229                                    true);
1230 
1231     if (errors) return -1;
1232 
1233     return 0;
1234 }
1235 
1236 #pragma mark -------- Bitwise functions
1237 
test_bitwise_num_results(size_t threadCount,ExplicitType dataType)1238 size_t test_bitwise_num_results(size_t threadCount, ExplicitType dataType)
1239 {
1240     size_t numBits = get_explicit_type_size(dataType) * 8;
1241 
1242     return (threadCount + numBits - 1) / numBits;
1243 }
1244 
1245 #pragma mark ---- and
1246 
1247 // clang-format off
1248 const char atom_and_core[] =
1249     "    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
1250     "    int  whichResult = tid / numBits;\n"
1251     "    int  bitIndex = tid - ( whichResult * numBits );\n"
1252     "\n"
1253     "    oldValues[tid] = atom_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n";
1254 
1255 const char atomic_and_core[] =
1256     "    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
1257     "    int  whichResult = tid / numBits;\n"
1258     "    int  bitIndex = tid - ( whichResult * numBits );\n"
1259     "\n"
1260     "    oldValues[tid] = atomic_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n";
1261 // clang-format on
1262 
1263 
test_atomic_and_result_int(size_t size,cl_int * startRefValues,size_t whichResult)1264 cl_int test_atomic_and_result_int(size_t size, cl_int *startRefValues,
1265                                   size_t whichResult)
1266 {
1267     size_t numThreads = ((size_t)size + 31) / 32;
1268     if (whichResult < numThreads - 1) return 0;
1269 
1270     // Last item doesn't get and'ed on every bit, so we have to mask away
1271     size_t numBits = (size_t)size - whichResult * 32;
1272     cl_int bits = (cl_int)0xffffffffL;
1273     for (size_t i = 0; i < numBits; i++) bits &= ~(1 << i);
1274 
1275     return bits;
1276 }
1277 
test_atomic_and_result_long(size_t size,cl_long * startRefValues,size_t whichResult)1278 cl_long test_atomic_and_result_long(size_t size, cl_long *startRefValues,
1279                                     size_t whichResult)
1280 {
1281     size_t numThreads = ((size_t)size + 63) / 64;
1282     if (whichResult < numThreads - 1) return 0;
1283 
1284     // Last item doesn't get and'ed on every bit, so we have to mask away
1285     size_t numBits = (size_t)size - whichResult * 64;
1286     cl_long bits = (cl_long)0xffffffffffffffffLL;
1287     for (size_t i = 0; i < numBits; i++) bits &= ~(1LL << i);
1288 
1289     return bits;
1290 }
1291 
test_atomic_and(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1292 int test_atomic_and(cl_device_id deviceID, cl_context context,
1293                     cl_command_queue queue, int num_elements)
1294 {
1295     TestFns set = { 0xffffffff,
1296                     0xffffffffffffffffLL,
1297                     test_bitwise_num_results,
1298                     test_atomic_and_result_int,
1299                     NULL,
1300                     NULL,
1301                     test_atomic_and_result_long,
1302                     NULL,
1303                     NULL };
1304 
1305     if (test_atomic_function_set(
1306             deviceID, context, queue, num_elements, atom_and_core, set, true,
1307             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
1308         != 0)
1309         return -1;
1310     if (test_atomic_function_set(
1311             deviceID, context, queue, num_elements, atomic_and_core, set, true,
1312             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
1313         != 0)
1314         return -1;
1315     return 0;
1316 }
1317 
1318 
1319 #pragma mark ---- or
1320 
1321 // clang-format off
1322 const char atom_or_core[] =
1323     "    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
1324     "    int  whichResult = tid / numBits;\n"
1325     "    int  bitIndex = tid - ( whichResult * numBits );\n"
1326     "\n"
1327     "    oldValues[tid] = atom_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n";
1328 
1329 const char atomic_or_core[] =
1330     "    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
1331     "    int  whichResult = tid / numBits;\n"
1332     "    int  bitIndex = tid - ( whichResult * numBits );\n"
1333     "\n"
1334     "    oldValues[tid] = atomic_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n";
1335 // clang-format on
1336 
test_atomic_or_result_int(size_t size,cl_int * startRefValues,size_t whichResult)1337 cl_int test_atomic_or_result_int(size_t size, cl_int *startRefValues,
1338                                  size_t whichResult)
1339 {
1340     size_t numThreads = ((size_t)size + 31) / 32;
1341     if (whichResult < numThreads - 1) return 0xffffffff;
1342 
1343     // Last item doesn't get and'ed on every bit, so we have to mask away
1344     size_t numBits = (size_t)size - whichResult * 32;
1345     cl_int bits = 0;
1346     for (size_t i = 0; i < numBits; i++) bits |= (1 << i);
1347 
1348     return bits;
1349 }
1350 
test_atomic_or_result_long(size_t size,cl_long * startRefValues,size_t whichResult)1351 cl_long test_atomic_or_result_long(size_t size, cl_long *startRefValues,
1352                                    size_t whichResult)
1353 {
1354     size_t numThreads = ((size_t)size + 63) / 64;
1355     if (whichResult < numThreads - 1) return 0x0ffffffffffffffffLL;
1356 
1357     // Last item doesn't get and'ed on every bit, so we have to mask away
1358     size_t numBits = (size_t)size - whichResult * 64;
1359     cl_long bits = 0;
1360     for (size_t i = 0; i < numBits; i++) bits |= (1LL << i);
1361 
1362     return bits;
1363 }
1364 
test_atomic_or(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1365 int test_atomic_or(cl_device_id deviceID, cl_context context,
1366                    cl_command_queue queue, int num_elements)
1367 {
1368     TestFns set = {
1369         0,    0LL,  test_bitwise_num_results,   test_atomic_or_result_int,
1370         NULL, NULL, test_atomic_or_result_long, NULL,
1371         NULL
1372     };
1373 
1374     if (test_atomic_function_set(
1375             deviceID, context, queue, num_elements, atom_or_core, set, true,
1376             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
1377         != 0)
1378         return -1;
1379     if (test_atomic_function_set(
1380             deviceID, context, queue, num_elements, atomic_or_core, set, true,
1381             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
1382         != 0)
1383         return -1;
1384     return 0;
1385 }
1386 
1387 
1388 #pragma mark ---- xor
1389 
1390 const char atom_xor_core[] =
1391     "    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
1392     "    int  bitIndex = tid & ( numBits - 1 );\n"
1393     "\n"
1394     "    oldValues[tid] = atom_xor( &destMemory[0], 1L << bitIndex );\n";
1395 
1396 const char atomic_xor_core[] =
1397     "    size_t numBits = sizeof( destMemory[0] ) * 8;\n"
1398     "    int  bitIndex = tid & ( numBits - 1 );\n"
1399     "\n"
1400     "    oldValues[tid] = atomic_xor( &destMemory[0], 1L << bitIndex );\n";
1401 
test_atomic_xor_result_int(size_t size,cl_int * startRefValues,size_t whichResult)1402 cl_int test_atomic_xor_result_int(size_t size, cl_int *startRefValues,
1403                                   size_t whichResult)
1404 {
1405     cl_int total = 0x2f08ab41;
1406     for (size_t i = 0; i < size; i++) total ^= (1 << (i & 31));
1407     return total;
1408 }
1409 
test_atomic_xor_result_long(size_t size,cl_long * startRefValues,size_t whichResult)1410 cl_long test_atomic_xor_result_long(size_t size, cl_long *startRefValues,
1411                                     size_t whichResult)
1412 {
1413     cl_long total = 0x2f08ab418ba0541LL;
1414     for (size_t i = 0; i < size; i++) total ^= (1LL << (i & 63));
1415     return total;
1416 }
1417 
test_atomic_xor(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)1418 int test_atomic_xor(cl_device_id deviceID, cl_context context,
1419                     cl_command_queue queue, int num_elements)
1420 {
1421     TestFns set = { 0x2f08ab41,
1422                     0x2f08ab418ba0541LL,
1423                     NULL,
1424                     test_atomic_xor_result_int,
1425                     NULL,
1426                     NULL,
1427                     test_atomic_xor_result_long,
1428                     NULL,
1429                     NULL };
1430 
1431     if (test_atomic_function_set(
1432             deviceID, context, queue, num_elements, atom_xor_core, set, true,
1433             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false)
1434         != 0)
1435         return -1;
1436     if (test_atomic_function_set(
1437             deviceID, context, queue, num_elements, atomic_xor_core, set, true,
1438             /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true)
1439         != 0)
1440         return -1;
1441     return 0;
1442 }
1443