//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef COMMON_H_
#define COMMON_H_

#include "harness/testHarness.h"
#include "harness/typeWrappers.h"
#include "harness/ThreadPool.h"

#include "host_atomics.h"

#include <vector>
#include <sstream>

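// Thread-count budget: when gHost is set (host-only debug mode), device
// execution is disabled entirely and only native host threads run; otherwise
// the device budget comes from gMaxDeviceThreads and the host budget from the
// harness thread pool.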
#define MAX_DEVICE_THREADS (gHost ? 0U : gMaxDeviceThreads)
#define MAX_HOST_THREADS GetThreadCount()

#define EXECUTE_TEST(error, test)                                              \
    error |= test;                                                             \
    if (error && !gContinueOnError) return error;
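
// Usage sketch (mirrors the call sites below): accumulate errors across
// sub-tests and return at the first failure unless gContinueOnError is set:
//
//   int error = 0;
//   EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
//   EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
//   return error;
//
// Note: the macro expands to two bare statements, so brace any surrounding
// if/else at the call site.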

enum TExplicitAtomicType
{
    TYPE_ATOMIC_INT,
    TYPE_ATOMIC_UINT,
    TYPE_ATOMIC_LONG,
    TYPE_ATOMIC_ULONG,
    TYPE_ATOMIC_FLOAT,
    TYPE_ATOMIC_DOUBLE,
    TYPE_ATOMIC_INTPTR_T,
    TYPE_ATOMIC_UINTPTR_T,
    TYPE_ATOMIC_SIZE_T,
    TYPE_ATOMIC_PTRDIFF_T,
    TYPE_ATOMIC_FLAG
};

enum TExplicitMemoryScopeType
{
    MEMORY_SCOPE_EMPTY,
    MEMORY_SCOPE_WORK_GROUP,
    MEMORY_SCOPE_DEVICE,
    MEMORY_SCOPE_ALL_DEVICES, // Alias for MEMORY_SCOPE_ALL_SVM_DEVICES
    MEMORY_SCOPE_ALL_SVM_DEVICES
};

extern bool
    gHost; // temporary flag for testing native host threads (test verification)
extern bool gOldAPI; // temporary flag for testing with old API (OpenCL 1.2)
extern bool gContinueOnError; // execute all cases even when errors are detected
extern bool
    gNoGlobalVariables; // disable cases with global atomics in program scope
extern bool gNoGenericAddressSpace; // disable cases with generic address space
extern bool gUseHostPtr; // use malloc/free instead of clSVMAlloc/clSVMFree
extern bool gDebug; // print OpenCL kernel code
extern int gInternalIterations; // internal test iterations for each atomic
                                // operation, sufficient to verify atomicity
extern int
    gMaxDeviceThreads; // maximum number of threads executed on the OCL device
extern cl_device_atomic_capabilities gAtomicMemCap,
    gAtomicFenceCap; // atomic memory and fence capabilities for this device

extern const char *
get_memory_order_type_name(TExplicitMemoryOrderType orderType);
extern const char *
get_memory_scope_type_name(TExplicitMemoryScopeType scopeType);

extern cl_int getSupportedMemoryOrdersAndScopes(
    cl_device_id device, std::vector<TExplicitMemoryOrderType> &memoryOrders,
    std::vector<TExplicitMemoryScopeType> &memoryScopes);

class AtomicTypeInfo {
public:
    TExplicitAtomicType _type;
    AtomicTypeInfo(TExplicitAtomicType type): _type(type) {}
    cl_uint Size(cl_device_id device);
    const char *AtomicTypeName();
    const char *RegularTypeName();
    const char *AddSubOperandTypeName();
    int IsSupported(cl_device_id device);
};

template <typename HostDataType>
class AtomicTypeExtendedInfo : public AtomicTypeInfo {
public:
    AtomicTypeExtendedInfo(TExplicitAtomicType type): AtomicTypeInfo(type) {}
    HostDataType MinValue();
    HostDataType MaxValue();
    HostDataType SpecialValue(cl_uchar x)
    {
        HostDataType tmp;
        cl_uchar *ptr = (cl_uchar *)&tmp;
        for (cl_uint i = 0; i < sizeof(HostDataType) / sizeof(cl_uchar); i++)
            ptr[i] = x;
        return tmp;
    }
    HostDataType SpecialValue(cl_ushort x)
    {
        HostDataType tmp;
        cl_ushort *ptr = (cl_ushort *)&tmp;
        for (cl_uint i = 0; i < sizeof(HostDataType) / sizeof(cl_ushort); i++)
            ptr[i] = x;
        return tmp;
    }
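    // Example for SpecialValue() above (illustrative): with HostDataType ==
    // cl_uint, SpecialValue((cl_uchar)0xAA) replicates the byte into every
    // position and yields 0xAAAAAAAA; SpecialValue((cl_ushort)0x1234) yields
    // 0x12341234.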
};

class CTest {
public:
    virtual int Execute(cl_device_id deviceID, cl_context context,
                        cl_command_queue queue, int num_elements) = 0;
};

template <typename HostAtomicType, typename HostDataType>
class CBasicTest : CTest {
public:
    typedef struct
    {
        CBasicTest *test;
        cl_uint tid;
        cl_uint threadCount;
        volatile HostAtomicType *destMemory;
        HostDataType *oldValues;
    } THostThreadContext;
    static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id,
                                     void *userInfo)
    {
        THostThreadContext *threadContext =
            ((THostThreadContext *)userInfo) + job_id;
        threadContext->test->HostFunction(
            threadContext->tid, threadContext->threadCount,
            threadContext->destMemory, threadContext->oldValues);
        return 0;
    }
    CBasicTest(TExplicitAtomicType dataType, bool useSVM)
        : CTest(), _maxDeviceThreads(MAX_DEVICE_THREADS), _dataType(dataType),
          _useSVM(useSVM), _startValue(255), _localMemory(false),
          _declaredInProgram(false), _usedInFunction(false),
          _genericAddrSpace(false), _oldValueCheck(true),
          _localRefValues(false), _maxGroupSize(0), _passCount(0),
          _iterations(gInternalIterations)
    {}
    virtual ~CBasicTest()
    {
        if (_passCount)
            log_info("  %u tests executed successfully for %s\n", _passCount,
                     DataType().AtomicTypeName());
    }
    virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
    {
        return 1;
    }
    virtual cl_uint NumNonAtomicVariablesPerThread() { return 1; }
    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
                               HostDataType *startRefValues,
                               cl_uint whichDestValue)
    {
        return false;
    }
    virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
                              MTdata d)
    {
        return false;
    }
    virtual bool VerifyRefs(bool &correct, cl_uint threadCount,
                            HostDataType *refValues,
                            HostAtomicType *finalValues)
    {
        return false;
    }
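    // Note on the hooks above (illustrative): concrete tests override either
    // ExpectedValue() (a deterministic final value per destination item) or
    // the GenerateRefs()/VerifyRefs() pair (randomized per-thread reference
    // data); the default "return false" means the hook is not provided.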
    virtual std::string PragmaHeader(cl_device_id deviceID);
    virtual std::string ProgramHeader(cl_uint maxNumDestItems);
    virtual std::string FunctionCode();
    virtual std::string KernelCode(cl_uint maxNumDestItems);
    virtual std::string ProgramCore() = 0;
    virtual std::string SingleTestName()
    {
        std::string testName = LocalMemory() ? "local" : "global";
        testName += " ";
        testName += DataType().AtomicTypeName();
        if (DeclaredInProgram())
        {
            testName += " declared in program";
        }
        if (DeclaredInProgram() && UsedInFunction()) testName += ",";
        if (UsedInFunction())
        {
            testName += " used in ";
            if (GenericAddrSpace()) testName += "generic ";
            testName += "function";
        }
        return testName;
    }
    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
                                  cl_command_queue queue);
    int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context,
                                  cl_command_queue queue)
    {
        int error = 0;
        UsedInFunction(false);
        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
        UsedInFunction(true);
        GenericAddrSpace(false);
        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
        GenericAddrSpace(true);
        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
        GenericAddrSpace(false);
        return error;
    }
    int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context,
                                      cl_command_queue queue)
    {
        int error = 0;
        DeclaredInProgram(false);
        EXECUTE_TEST(error,
                     ExecuteForEachPointerType(deviceID, context, queue));
        if (!UseSVM())
        {
            DeclaredInProgram(true);
            EXECUTE_TEST(error,
                         ExecuteForEachPointerType(deviceID, context, queue));
        }
        return error;
    }
    virtual int ExecuteForEachParameterSet(cl_device_id deviceID,
                                           cl_context context,
                                           cl_command_queue queue)
    {
        int error = 0;
        if (_maxDeviceThreads > 0 && !UseSVM())
        {
            LocalMemory(true);
            EXECUTE_TEST(
                error, ExecuteForEachDeclarationType(deviceID, context, queue));
        }
        if (_maxDeviceThreads + MaxHostThreads() > 0)
        {
            LocalMemory(false);
            EXECUTE_TEST(
                error, ExecuteForEachDeclarationType(deviceID, context, queue));
        }
        return error;
    }
    virtual int Execute(cl_device_id deviceID, cl_context context,
                        cl_command_queue queue, int num_elements)
    {
        if (sizeof(HostAtomicType) != DataType().Size(deviceID))
        {
            log_info("Invalid test: Host atomic type size (%u) is different "
                     "than OpenCL type size (%u)\n",
                     (cl_uint)sizeof(HostAtomicType),
                     DataType().Size(deviceID));
            return -1;
        }
        if (sizeof(HostAtomicType) != sizeof(HostDataType))
        {
            log_info("Invalid test: Host atomic type size (%u) is different "
                     "than corresponding type size (%u)\n",
                     (cl_uint)sizeof(HostAtomicType),
                     (cl_uint)sizeof(HostDataType));
            return -1;
        }
        // Verify we can run first
        if (UseSVM() && !gUseHostPtr)
        {
            cl_device_svm_capabilities caps;
            cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES,
                                           sizeof(caps), &caps, 0);
            test_error(error, "clGetDeviceInfo failed");
            if ((caps & CL_DEVICE_SVM_ATOMICS) == 0)
            {
                log_info("\t%s - SVM_ATOMICS not supported\n",
                         DataType().AtomicTypeName());
                // implicit pass
                return 0;
            }
        }
        if (!DataType().IsSupported(deviceID))
        {
            log_info("\t%s not supported\n", DataType().AtomicTypeName());
            // implicit pass or host test (debug feature)
            if (UseSVM()) return 0;
            _maxDeviceThreads = 0;
        }
        if (_maxDeviceThreads + MaxHostThreads() == 0) return 0;
        return ExecuteForEachParameterSet(deviceID, context, queue);
    }
    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
                              volatile HostAtomicType *destMemory,
                              HostDataType *oldValues)
    {
        log_info("Empty thread function %u\n", (cl_uint)tid);
    }
    AtomicTypeExtendedInfo<HostDataType> DataType() const
    {
        return AtomicTypeExtendedInfo<HostDataType>(_dataType);
    }
    cl_uint _maxDeviceThreads;
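    // Host threads participate only when the test uses SVM buffers (host and
    // device share the allocation) or when gHost forces host-only execution.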
    virtual cl_uint MaxHostThreads()
    {
        if (UseSVM() || gHost)
            return MAX_HOST_THREADS;
        else
            return 0;
    }

    int CheckCapabilities(TExplicitMemoryScopeType memoryScope,
                          TExplicitMemoryOrderType memoryOrder)
    {
        /*
            Differentiation between atomic fence and other atomic operations
            does not need to occur here.

            The initialisation of this test checks that the minimum required
            capabilities are supported by this device.

            The following switches allow the test to skip if optional
            capabilities are not supported by the device.
        */
        switch (memoryScope)
        {
            case MEMORY_SCOPE_EMPTY: {
                break;
            }
            case MEMORY_SCOPE_WORK_GROUP: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            case MEMORY_SCOPE_DEVICE: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            case MEMORY_SCOPE_ALL_DEVICES: // fallthrough
            case MEMORY_SCOPE_ALL_SVM_DEVICES: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            default: {
                log_info("Invalid memory scope\n");
                break;
            }
        }

        switch (memoryOrder)
        {
            case MEMORY_ORDER_EMPTY: {
                break;
            }
            case MEMORY_ORDER_RELAXED: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_RELAXED) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            case MEMORY_ORDER_ACQUIRE:
            case MEMORY_ORDER_RELEASE:
            case MEMORY_ORDER_ACQ_REL: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            case MEMORY_ORDER_SEQ_CST: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            default: {
                log_info("Invalid memory order\n");
                break;
            }
        }

        return 0;
    }
    virtual bool SVMDataBufferAllSVMConsistent() { return false; }
    bool UseSVM() { return _useSVM; }
    void StartValue(HostDataType startValue) { _startValue = startValue; }
    HostDataType StartValue() { return _startValue; }
    void LocalMemory(bool local) { _localMemory = local; }
    bool LocalMemory() { return _localMemory; }
    void DeclaredInProgram(bool declaredInProgram)
    {
        _declaredInProgram = declaredInProgram;
    }
    bool DeclaredInProgram() { return _declaredInProgram; }
    void UsedInFunction(bool local) { _usedInFunction = local; }
    bool UsedInFunction() { return _usedInFunction; }
    void GenericAddrSpace(bool genericAddrSpace)
    {
        _genericAddrSpace = genericAddrSpace;
    }
    bool GenericAddrSpace() { return _genericAddrSpace; }
    void OldValueCheck(bool check) { _oldValueCheck = check; }
    bool OldValueCheck() { return _oldValueCheck; }
    void LocalRefValues(bool localRefValues)
    {
        _localRefValues = localRefValues;
    }
    bool LocalRefValues() { return _localRefValues; }
    void MaxGroupSize(cl_uint maxGroupSize) { _maxGroupSize = maxGroupSize; }
    cl_uint MaxGroupSize() { return _maxGroupSize; }
    void CurrentGroupSize(cl_uint currentGroupSize)
    {
        if (MaxGroupSize() && MaxGroupSize() < currentGroupSize)
            _currentGroupSize = MaxGroupSize();
        else
            _currentGroupSize = currentGroupSize;
    }
    cl_uint CurrentGroupSize() { return _currentGroupSize; }
    virtual cl_uint CurrentGroupNum(cl_uint threadCount)
    {
        if (threadCount == 0) return 0;
        if (LocalMemory()) return 1;
        return threadCount / CurrentGroupSize();
    }
    cl_int Iterations() { return _iterations; }
    std::string IterationsStr()
    {
        std::stringstream ss;
        ss << _iterations;
        return ss.str();
    }

private:
    const TExplicitAtomicType _dataType;
    const bool _useSVM;
    HostDataType _startValue;
    bool _localMemory;
    bool _declaredInProgram;
    bool _usedInFunction;
    bool _genericAddrSpace;
    bool _oldValueCheck;
    bool _localRefValues;
    cl_uint _maxGroupSize;
    cl_uint _currentGroupSize;
    cl_uint _passCount;
    const cl_int _iterations;
};
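
// Minimal sketch of a concrete test built on CBasicTest (illustrative;
// CTestStore and its body are hypothetical, not part of this header). A
// subclass typically only supplies ProgramCore(), the per-work-item OpenCL C
// snippet, plus whichever verification hooks it needs:
//
//   template <typename HostAtomicType, typename HostDataType>
//   class CTestStore
//       : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
//   public:
//       CTestStore(TExplicitAtomicType dataType, bool useSVM)
//           : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
//                                                                   useSVM)
//       {}
//       virtual std::string ProgramCore()
//       {
//           return std::string("  atomic_store_explicit(&destMemory[0], tid")
//               + this->MemoryOrderScopeStr() + ");\n";
//       }
//   };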

template <typename HostAtomicType, typename HostDataType>
class CBasicTestMemOrderScope
    : public CBasicTest<HostAtomicType, HostDataType> {
public:
    using CBasicTest<HostAtomicType, HostDataType>::LocalMemory;
    using CBasicTest<HostAtomicType, HostDataType>::MaxGroupSize;
    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
    CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false)
        : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM)
    {}
    virtual std::string ProgramHeader(cl_uint maxNumDestItems)
    {
        std::string header;
        if (gOldAPI)
        {
            std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s";
            header += "#define atomic_store_explicit(x,y,o" + s
                + ")                     atomic_store(x,y)\n"
                  "#define atomic_load_explicit(x,o"
                + s
                + ")                        atomic_load(x)\n"
                  "#define atomic_exchange_explicit(x,y,o"
                + s
                + ")                  atomic_exchange(x,y)\n"
                  "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of"
                + s
                + ") atomic_compare_exchange_strong(x,y,z)\n"
                  "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of"
                + s
                + ")   atomic_compare_exchange_weak(x,y,z)\n"
                  "#define atomic_fetch_add_explicit(x,y,o"
                + s
                + ")                 atomic_fetch_add(x,y)\n"
                  "#define atomic_fetch_sub_explicit(x,y,o"
                + s
                + ")                 atomic_fetch_sub(x,y)\n"
                  "#define atomic_fetch_or_explicit(x,y,o"
                + s
                + ")                  atomic_fetch_or(x,y)\n"
                  "#define atomic_fetch_xor_explicit(x,y,o"
                + s
                + ")                 atomic_fetch_xor(x,y)\n"
                  "#define atomic_fetch_and_explicit(x,y,o"
                + s
                + ")                 atomic_fetch_and(x,y)\n"
                  "#define atomic_fetch_min_explicit(x,y,o"
                + s
                + ")                 atomic_fetch_min(x,y)\n"
                  "#define atomic_fetch_max_explicit(x,y,o"
                + s
                + ")                 atomic_fetch_max(x,y)\n"
                  "#define atomic_flag_test_and_set_explicit(x,o"
                + s
                + ")           atomic_flag_test_and_set(x)\n"
                  "#define atomic_flag_clear_explicit(x,o"
                + s + ")                  atomic_flag_clear(x)\n";
        }
        return header
            + CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(
                   maxNumDestItems);
    }
    virtual std::string SingleTestName()
    {
        std::string testName =
            CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
        if (MemoryOrder() != MEMORY_ORDER_EMPTY)
        {
            testName += std::string(", ")
                + std::string(get_memory_order_type_name(MemoryOrder()))
                      .substr(sizeof("memory"));
        }
        if (MemoryScope() != MEMORY_SCOPE_EMPTY)
        {
            testName += std::string(", ")
                + std::string(get_memory_scope_type_name(MemoryScope()))
                      .substr(sizeof("memory"));
        }
        return testName;
    }
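    // Example result (illustrative, assuming get_memory_order_type_name() and
    // get_memory_scope_type_name() return names such as
    // "memory_order_seq_cst"): "global atomic_int, order_seq_cst,
    // scope_device".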
    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
                                  cl_command_queue queue)
    {
        if (LocalMemory() && MemoryScope() != MEMORY_SCOPE_EMPTY
            && MemoryScope()
                != MEMORY_SCOPE_WORK_GROUP) // memory scope should only be used
                                            // for global memory
            return 0;
        if (MemoryScope() == MEMORY_SCOPE_DEVICE)
            MaxGroupSize(
                16); // increase number of groups by forcing smaller group size
        else
            MaxGroupSize(0); // group size limited by device capabilities

        if (CheckCapabilities(MemoryScope(), MemoryOrder())
            == TEST_SKIPPED_ITSELF)
            return 0; // skip test - not applicable

        return CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(
            deviceID, context, queue);
    }
    virtual int ExecuteForEachParameterSet(cl_device_id deviceID,
                                           cl_context context,
                                           cl_command_queue queue)
    {
        // repeat the test for each reasonable memory order/scope combination
        std::vector<TExplicitMemoryOrderType> memoryOrder;
        std::vector<TExplicitMemoryScopeType> memoryScope;
        int error = 0;

        // For OpenCL-3.0 and later some orderings and scopes are optional, so
        // here we query for the supported ones.
        test_error_ret(getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder,
                                                         memoryScope),
                       "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);

        for (unsigned oi = 0; oi < memoryOrder.size(); oi++)
        {
            for (unsigned si = 0; si < memoryScope.size(); si++)
            {
                if (memoryOrder[oi] == MEMORY_ORDER_EMPTY
                    && memoryScope[si] != MEMORY_SCOPE_EMPTY)
                    continue;
                MemoryOrder(memoryOrder[oi]);
                MemoryScope(memoryScope[si]);
                EXECUTE_TEST(
                    error,
                    (CBasicTest<HostAtomicType, HostDataType>::
                         ExecuteForEachParameterSet(deviceID, context, queue)));
            }
        }
        return error;
    }
    void MemoryOrder(TExplicitMemoryOrderType memoryOrder)
    {
        _memoryOrder = memoryOrder;
    }
    TExplicitMemoryOrderType MemoryOrder() { return _memoryOrder; }
    std::string MemoryOrderStr()
    {
        if (MemoryOrder() != MEMORY_ORDER_EMPTY)
            return std::string(", ")
                + get_memory_order_type_name(MemoryOrder());
        return "";
    }
    void MemoryScope(TExplicitMemoryScopeType memoryScope)
    {
        _memoryScope = memoryScope;
    }
    TExplicitMemoryScopeType MemoryScope() { return _memoryScope; }
    std::string MemoryScopeStr()
    {
        if (MemoryScope() != MEMORY_SCOPE_EMPTY)
            return std::string(", ")
                + get_memory_scope_type_name(MemoryScope());
        return "";
    }
    std::string MemoryOrderScopeStr()
    {
        return MemoryOrderStr() + MemoryScopeStr();
    }
    virtual cl_uint CurrentGroupNum(cl_uint threadCount)
    {
        if (MemoryScope() == MEMORY_SCOPE_WORK_GROUP) return 1;
        return CBasicTest<HostAtomicType, HostDataType>::CurrentGroupNum(
            threadCount);
    }
    virtual cl_uint MaxHostThreads()
    {
        // block host thread execution for any memory scope other than
        // memory_scope_all_svm_devices (or its alias, all_devices)
        if (MemoryScope() == MEMORY_SCOPE_ALL_DEVICES
            || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost)
        {
            return CBasicTest<HostAtomicType, HostDataType>::MaxHostThreads();
        }
        else
        {
            return 0;
        }
    }

private:
    TExplicitMemoryOrderType _memoryOrder;
    TExplicitMemoryScopeType _memoryScope;
};

template <typename HostAtomicType, typename HostDataType>
class CBasicTestMemOrder2Scope
    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
public:
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderStr;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;

    CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false)
        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
                                                                useSVM)
    {}
    virtual std::string SingleTestName()
    {
        std::string testName =
            CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
        if (MemoryOrder() != MEMORY_ORDER_EMPTY)
            testName += std::string(", ")
                + std::string(get_memory_order_type_name(MemoryOrder()))
                      .substr(sizeof("memory"));
        if (MemoryOrder2() != MEMORY_ORDER_EMPTY)
            testName += std::string(", ")
                + std::string(get_memory_order_type_name(MemoryOrder2()))
                      .substr(sizeof("memory"));
        if (MemoryScope() != MEMORY_SCOPE_EMPTY)
            testName += std::string(", ")
                + std::string(get_memory_scope_type_name(MemoryScope()))
                      .substr(sizeof("memory"));
        return testName;
    }
    virtual int ExecuteForEachParameterSet(cl_device_id deviceID,
                                           cl_context context,
                                           cl_command_queue queue)
    {
        // repeat the test for each reasonable memory order/scope combination
        std::vector<TExplicitMemoryOrderType> memoryOrder;
        std::vector<TExplicitMemoryScopeType> memoryScope;
        int error = 0;

        // For OpenCL-3.0 and later some orderings and scopes are optional, so
        // here we query for the supported ones.
        test_error_ret(getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder,
                                                         memoryScope),
                       "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);

        for (unsigned oi = 0; oi < memoryOrder.size(); oi++)
        {
            for (unsigned o2i = 0; o2i < memoryOrder.size(); o2i++)
            {
                for (unsigned si = 0; si < memoryScope.size(); si++)
                {
                    if ((memoryOrder[oi] == MEMORY_ORDER_EMPTY
                         || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
                        && memoryOrder[oi] != memoryOrder[o2i])
                        continue; // both memory order arguments must be set
                                  // (or neither)
                    if ((memoryOrder[oi] == MEMORY_ORDER_EMPTY
                         || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
                        && memoryScope[si] != MEMORY_SCOPE_EMPTY)
                        continue; // a memory scope without a memory order is
                                  // not allowed
                    MemoryOrder(memoryOrder[oi]);
                    MemoryOrder2(memoryOrder[o2i]);
                    MemoryScope(memoryScope[si]);

                    if (CheckCapabilities(MemoryScope(), MemoryOrder())
                        == TEST_SKIPPED_ITSELF)
                        continue; // skip test - not applicable

                    if (CheckCapabilities(MemoryScope(), MemoryOrder2())
                        == TEST_SKIPPED_ITSELF)
                        continue; // skip test - not applicable

                    EXECUTE_TEST(error,
                                 (CBasicTest<HostAtomicType, HostDataType>::
                                      ExecuteForEachParameterSet(
                                          deviceID, context, queue)));
                }
            }
        }
        return error;
    }
    void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail)
    {
        _memoryOrder2 = memoryOrderFail;
    }
    TExplicitMemoryOrderType MemoryOrder2() { return _memoryOrder2; }
    std::string MemoryOrderFailStr()
    {
        if (MemoryOrder2() != MEMORY_ORDER_EMPTY)
            return std::string(", ")
                + get_memory_order_type_name(MemoryOrder2());
        return "";
    }
    std::string MemoryOrderScope()
    {
        return MemoryOrderStr() + MemoryOrderFailStr() + MemoryScopeStr();
    }

private:
    TExplicitMemoryOrderType _memoryOrder2;
};

template <typename HostAtomicType, typename HostDataType>
std::string
CBasicTest<HostAtomicType, HostDataType>::PragmaHeader(cl_device_id deviceID)
{
    std::string pragma;

    if (gOldAPI)
    {
        pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : "
                  "enable\n";
        pragma += "#pragma OPENCL EXTENSION "
                  "cl_khr_local_int32_extended_atomics : enable\n";
        pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : "
                  "enable\n";
        pragma += "#pragma OPENCL EXTENSION "
                  "cl_khr_global_int32_extended_atomics : enable\n";
    }
    // Create the pragma lines for this kernel
    if (DataType().Size(deviceID) == 8)
    {
        pragma +=
            "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
        pragma +=
            "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n";
    }
    if (_dataType == TYPE_ATOMIC_DOUBLE)
        pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
    return pragma;
}

template <typename HostAtomicType, typename HostDataType>
std::string
CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems)
{
    // Create the program header
    std::string header;
    std::string aTypeName = DataType().AtomicTypeName();
    std::string cTypeName = DataType().RegularTypeName();
    std::string argListForKernel;
    std::string argListForFunction;
    std::string argListNoTypes;
    std::string functionPrototype;
    std::string addressSpace = LocalMemory() ? "__local " : "__global ";

    if (gOldAPI)
    {
        header += std::string("#define ") + aTypeName + " " + cTypeName
            + "\n"
              "#define atomic_store(x,y)                                (*(x) "
              "= y)\n"
              "#define atomic_load(x)                                   "
              "(*(x))\n"
              "#define ATOMIC_VAR_INIT(x)                               (x)\n"
              "#define ATOMIC_FLAG_INIT                                 0\n"
              "#define atomic_init(x,y)                                 "
              "atomic_store(x,y)\n";
        if (aTypeName == "atomic_float")
            header += "#define atomic_exchange(x,y)                            "
                      " atomic_xchg(x,y)\n";
        else if (aTypeName == "atomic_double")
            header += "double atomic_exchange(volatile " + addressSpace
                + "atomic_double *x, double y)\n"
                  "{\n"
                  "  long tmp = *(long*)&y, res;\n"
                  "  volatile "
                + addressSpace + "long *tmpA = (volatile " + addressSpace
                + "long *)x;\n"
                  "  res = atom_xchg(tmpA,tmp);\n"
                  "  return *(double*)&res;\n"
                  "}\n";
        else
            header += "#define atomic_exchange(x,y)                            "
                      " atom_xchg(x,y)\n";
        if (aTypeName != "atomic_float" && aTypeName != "atomic_double")
            header += "bool atomic_compare_exchange_strong(volatile "
                + addressSpace + " " + aTypeName + " *a, " + cTypeName
                + " *expected, " + cTypeName
                + " desired)\n"
                  "{\n"
                  "  "
                + cTypeName
                + " old = atom_cmpxchg(a, *expected, desired);\n"
                  "  if(old == *expected)\n"
                  "    return true;\n"
                  "  *expected = old;\n"
                  "  return false;\n"
                  "}\n"
                  "#define atomic_compare_exchange_weak                     "
                  "atomic_compare_exchange_strong\n";
        header += "#define atomic_fetch_add(x,y)                            "
                  "atom_add(x,y)\n"
                  "#define atomic_fetch_sub(x,y)                            "
                  "atom_sub(x,y)\n"
                  "#define atomic_fetch_or(x,y)                             "
                  "atom_or(x,y)\n"
                  "#define atomic_fetch_xor(x,y)                            "
                  "atom_xor(x,y)\n"
                  "#define atomic_fetch_and(x,y)                            "
                  "atom_and(x,y)\n"
                  "#define atomic_fetch_min(x,y)                            "
                  "atom_min(x,y)\n"
                  "#define atomic_fetch_max(x,y)                            "
                  "atom_max(x,y)\n"
                  "#define atomic_flag_test_and_set(x)                      "
                  "atomic_exchange(x,1)\n"
                  "#define atomic_flag_clear(x)                             "
                  "atomic_store(x,0)\n"
                  "\n";
    }
    if (!LocalMemory() && DeclaredInProgram())
    {
        // additional atomic variable for results copying (last thread will do
        // this)
        header += "__global volatile atomic_uint finishedThreads = "
                  "ATOMIC_VAR_INIT(0);\n";
        // atomic variables declared in program scope - test data
        std::stringstream ss;
        ss << maxNumDestItems;
        header += std::string("__global volatile ") + aTypeName + " destMemory["
            + ss.str() + "] = {\n";
        ss.str("");
        ss << _startValue;
        for (cl_uint i = 0; i < maxNumDestItems; i++)
        {
            if (aTypeName == "atomic_flag")
                header += "  ATOMIC_FLAG_INIT";
            else
                header += "  ATOMIC_VAR_INIT(" + ss.str() + ")";
            if (i + 1 < maxNumDestItems) header += ",";
            header += "\n";
        }
        header += "};\n"
                  "\n";
    }
    return header;
}
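
// For reference (illustrative), with DeclaredInProgram(), atomic_int,
// maxNumDestItems == 2 and the default _startValue of 255, the generated
// program-scope declarations are:
//
//   __global volatile atomic_uint finishedThreads = ATOMIC_VAR_INIT(0);
//   __global volatile atomic_int destMemory[2] = {
//     ATOMIC_VAR_INIT(255),
//     ATOMIC_VAR_INIT(255)
//   };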

template <typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::FunctionCode()
{
    if (!UsedInFunction()) return "";
    std::string addressSpace = LocalMemory() ? "__local " : "__global ";
    std::string code = "void test_atomic_function(uint tid, uint threadCount, "
                       "uint numDestItems, volatile ";
    if (!GenericAddrSpace()) code += addressSpace;
    code += std::string(DataType().AtomicTypeName()) + " *destMemory, __global "
        + DataType().RegularTypeName() + " *oldValues";
    if (LocalRefValues())
        code += std::string(", __local ") + DataType().RegularTypeName()
            + " *localValues";
    code += ")\n"
            "{\n";
    code += ProgramCore();
    code += "}\n"
            "\n";
    return code;
}
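
// The generated wrapper has this shape (illustrative, for atomic_int in global
// memory without the generic address space, assuming RegularTypeName() yields
// "int"):
//
//   void test_atomic_function(uint tid, uint threadCount, uint numDestItems,
//                             volatile __global atomic_int *destMemory,
//                             __global int *oldValues)
//   {
//     <ProgramCore() snippet>
//   }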

template <typename HostAtomicType, typename HostDataType>
std::string
CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
{
    std::string aTypeName = DataType().AtomicTypeName();
    std::string cTypeName = DataType().RegularTypeName();
    std::string addressSpace = LocalMemory() ? "__local " : "__global ";
    std::string code = "__kernel void test_atomic_kernel(uint threadCount, "
                       "uint numDestItems, ";

    // prepare list of arguments for kernel
    if (LocalMemory())
    {
        code += std::string("__global ") + cTypeName + " *finalDest, __global "
            + cTypeName
            + " *oldValues,"
              " volatile "
            + addressSpace + aTypeName + " *"
            + (DeclaredInProgram() ? "notUsed" : "") + "destMemory";
    }
    else
    {
        code += "volatile " + addressSpace
            + (DeclaredInProgram() ? (cTypeName + " *finalDest")
                                   : (aTypeName + " *destMemory"))
            + ", __global " + cTypeName + " *oldValues";
    }
    if (LocalRefValues())
        code += std::string(", __local ") + cTypeName + " *localValues";
    code += ")\n"
            "{\n";
    if (LocalMemory() && DeclaredInProgram())
    {
        // local atomics declared in kernel scope
        std::stringstream ss;
        ss << maxNumDestItems;
        code += std::string("  __local volatile ") + aTypeName + " destMemory["
            + ss.str() + "];\n";
    }
    code += "  uint  tid = get_global_id(0);\n"
            "\n";
    if (LocalMemory())
    {
        // memory_order_relaxed is sufficient for these initialization
        // operations as the barrier below will act as a fence, providing an
        // order to the operations. memory_scope_work_group is sufficient as
        // local memory is only visible within the work-group.
        code += R"(
              // initialize atomics not reachable from host (first thread
              // is doing this, other threads are waiting on barrier)
              if(get_local_id(0) == 0)
                for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)
                {)";
        if (aTypeName == "atomic_flag")
        {
            code += R"(
                  if(finalDest[dstItemIdx])
                    atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
                                                      memory_order_relaxed,
                                                      memory_scope_work_group);
                  else
                    atomic_flag_clear_explicit(destMemory+dstItemIdx,
                                               memory_order_relaxed,
                                               memory_scope_work_group);)";
        }
        else
        {
            code += R"(
                atomic_store_explicit(destMemory+dstItemIdx,
                                      finalDest[dstItemIdx],
                                      memory_order_relaxed,
                                      memory_scope_work_group);)";
        }
        code += "    }\n"
                "  barrier(CLK_LOCAL_MEM_FENCE);\n"
                "\n";
    }
    if (LocalRefValues())
    {
        code += "  // Copy input reference values into local memory\n";
        if (NumNonAtomicVariablesPerThread() == 1)
            code += "  localValues[get_local_id(0)] = oldValues[tid];\n";
        else
        {
            std::stringstream ss;
            ss << NumNonAtomicVariablesPerThread();
            code += "  for(uint rfId = 0; rfId < " + ss.str()
                + "; rfId++)\n"
                  "    localValues[get_local_id(0)*"
                + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n";
        }
        code += "  barrier(CLK_LOCAL_MEM_FENCE);\n"
                "\n";
    }
    if (UsedInFunction())
        code += std::string("  test_atomic_function(tid, threadCount, "
                            "numDestItems, destMemory, oldValues")
            + (LocalRefValues() ? ", localValues" : "") + ");\n";
    else
        code += ProgramCore();
    code += "\n";
    if (LocalRefValues())
    {
        code += "  // Copy local reference values into output array\n"
                "  barrier(CLK_LOCAL_MEM_FENCE);\n";
        if (NumNonAtomicVariablesPerThread() == 1)
            code += "  oldValues[tid] = localValues[get_local_id(0)];\n";
        else
        {
            std::stringstream ss;
            ss << NumNonAtomicVariablesPerThread();
            code += "  for(uint rfId = 0; rfId < " + ss.str()
                + "; rfId++)\n"
                  "    oldValues[tid*"
                + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str()
                + "+rfId];\n";
        }
        code += "\n";
    }
    if (LocalMemory())
    {
        code += "  // Copy final values to host reachable buffer\n";
        code += "  barrier(CLK_LOCAL_MEM_FENCE);\n"
                "  if(get_local_id(0) == 0) // first thread in workgroup\n";
        code += "    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; "
                "dstItemIdx++)\n";
        if (aTypeName == "atomic_flag")
        {
            code += R"(
                finalDest[dstItemIdx] =
                    atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
                                                      memory_order_relaxed,
                                                      memory_scope_work_group);)";
        }
        else
        {
            code += R"(
                finalDest[dstItemIdx] =
                    atomic_load_explicit(destMemory+dstItemIdx,
                                         memory_order_relaxed,
                                         memory_scope_work_group);)";
        }
    }
    else if (DeclaredInProgram())
    {
        // global atomics declared in program scope
        code += "  // Copy final values to host reachable buffer\n";
        code += R"(
            if(atomic_fetch_add_explicit(&finishedThreads, 1u,
                                         memory_order_acq_rel,
                                         memory_scope_device)
                   == get_global_size(0)-1) // last finished thread
                )";
        code += "    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; "
                "dstItemIdx++)\n";
        if (aTypeName == "atomic_flag")
        {
            code += R"(
                finalDest[dstItemIdx] =
                    atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
                                                      memory_order_relaxed,
                                                      memory_scope_device);)";
        }
        else
        {
            code += R"(
                finalDest[dstItemIdx] =
                    atomic_load_explicit(destMemory+dstItemIdx,
                                         memory_order_relaxed,
                                         memory_scope_device);)";
        }
    }
    code += "}\n"
            "\n";
    return code;
}

template <typename HostAtomicType, typename HostDataType>
int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(
    cl_device_id deviceID, cl_context context, cl_command_queue queue)
{
    int error;
    clProgramWrapper program;
    clKernelWrapper kernel;
    size_t threadNum[1];
    clMemWrapper streams[2];
    std::vector<HostAtomicType> destItems;
    HostAtomicType *svmAtomicBuffer = 0;
    std::vector<HostDataType> refValues, startRefValues;
    HostDataType *svmDataBuffer = 0;
    cl_uint deviceThreadCount, hostThreadCount, threadCount;
    size_t groupSize = 0;
    std::string programSource;
    const char *programLine;
    MTdata d;
    size_t typeSize = DataType().Size(deviceID);

    deviceThreadCount = _maxDeviceThreads;
    hostThreadCount = MaxHostThreads();
    threadCount = deviceThreadCount + hostThreadCount;

    // log_info("\t%s %s%s...\n", local ? "local" : "global",
    // DataType().AtomicTypeName(), memoryOrderScope.c_str());
    log_info("\t%s...\n", SingleTestName().c_str());

    if (!LocalMemory() && DeclaredInProgram()
        && gNoGlobalVariables) // no support for program scope global variables
    {
        log_info("\t\tTest disabled\n");
        return 0;
    }
    if (UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace)
    {
        log_info("\t\tTest disabled\n");
        return 0;
    }
    if (!LocalMemory() && DeclaredInProgram())
    {
        if (((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0)
            || ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0))
        {
            log_info("\t\tTest disabled\n");
            return 0;
        }
    }

    // set up work sizes based on device capabilities and test configuration
    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                            sizeof(groupSize), &groupSize, NULL);
    test_error(error, "Unable to obtain max work group size for device");
    CurrentGroupSize((cl_uint)groupSize);
    if (CurrentGroupSize() > deviceThreadCount)
        CurrentGroupSize(deviceThreadCount);
    if (CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
        deviceThreadCount =
            CurrentGroupSize() * CurrentGroupNum(deviceThreadCount);
    threadCount = deviceThreadCount + hostThreadCount;

    // Determine how many result objects we need. This is a first estimate
    // based on the current maximum number of threads (the exact thread count
    // is not known yet); it is needed for program source generation (arrays of
    // atomics declared in program scope).
    cl_uint numDestItems = NumResults(threadCount, deviceID);

    if (deviceThreadCount > 0)
    {
        // This loop iteratively halves the workgroup size, re-generating the
        // kernel with the reduced size, until we find a size which is
        // admissible for the kernel being run or until the workgroup size is
        // reduced to the trivial case of 1 (which was separately verified to
        // be accurate for the kernel being run).
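        //
        // For example (illustrative), starting from a device maximum of 256:
        // try 256, then 128, 64, ..., stopping as soon as CurrentGroupSize()
        // fits both CL_KERNEL_WORK_GROUP_SIZE and the local-memory bound
        // computed below.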

        while ((CurrentGroupSize() > 1))
        {
            // Re-generate the kernel code with the current group size
            if (kernel) clReleaseKernel(kernel);
            if (program) clReleaseProgram(program);
            programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems)
                + FunctionCode() + KernelCode(numDestItems);
            programLine = programSource.c_str();
            if (create_single_kernel_helper_with_build_options(
                    context, &program, &kernel, 1, &programLine,
                    "test_atomic_kernel", gOldAPI ? "" : nullptr))
            {
                return -1;
            }
            // Get work group size for the new kernel
            error = clGetKernelWorkGroupInfo(
                kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(groupSize),
                &groupSize, NULL);
            test_error(error,
                       "Unable to obtain max work group size for device and "
                       "kernel combo");

            if (LocalMemory())
            {
                cl_ulong usedLocalMemory;
                cl_ulong totalLocalMemory;
                cl_uint maxWorkGroupSize;

                error = clGetKernelWorkGroupInfo(
                    kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE,
                    sizeof(usedLocalMemory), &usedLocalMemory, NULL);
                test_error(error, "clGetKernelWorkGroupInfo failed");

                error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
                                        sizeof(totalLocalMemory),
                                        &totalLocalMemory, NULL);
                test_error(error, "clGetDeviceInfo failed");

                // We know that each work-group is going to use typeSize *
                // deviceThreadCount bytes of local memory, so pick the maximum
                // workgroup size that fits in the remaining local memory.
                maxWorkGroupSize =
                    ((totalLocalMemory - usedLocalMemory) / typeSize);

                if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize;
            }
            if (CurrentGroupSize() <= groupSize)
                break;
            else
                CurrentGroupSize(CurrentGroupSize() / 2);
        }
        if (CurrentGroupSize() > deviceThreadCount)
            CurrentGroupSize(deviceThreadCount);
        if (CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
            deviceThreadCount =
                CurrentGroupSize() * CurrentGroupNum(deviceThreadCount);
        threadCount = deviceThreadCount + hostThreadCount;
    }
    if (gDebug)
    {
        log_info("Program source:\n");
        log_info("%s\n", programLine);
    }
    if (deviceThreadCount > 0)
        log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount,
                 CurrentGroupSize());
    if (hostThreadCount > 0)
        log_info("\t\t(host threads %u)\n", hostThreadCount);

    refValues.resize(threadCount * NumNonAtomicVariablesPerThread());

    // Generate ref data if we have a ref generator provided
    d = init_genrand(gRandomSeed);
    startRefValues.resize(threadCount * NumNonAtomicVariablesPerThread());
    if (GenerateRefs(threadCount, &startRefValues[0], d))
    {
        // copy ref values for host threads
        memcpy(&refValues[0], &startRefValues[0],
               sizeof(HostDataType) * threadCount
                   * NumNonAtomicVariablesPerThread());
    }
    else
    {
        startRefValues.resize(0);
    }
    free_mtdata(d);
    d = NULL;

    // Determine how many result objects we need; if no NumResults() override
    // is provided, the default is 1. This is the final value (the exact thread
    // count is known at this point).
1261     numDestItems = NumResults(threadCount, deviceID);
1262 
1263     destItems.resize(numDestItems);
1264     for (cl_uint i = 0; i < numDestItems; i++) destItems[i] = _startValue;
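    // Every destination item starts from the same _startValue, so the
    // expected final value is well defined for each result slot.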
1265 
    // Create the main buffer of atomic variables (the array size depends on
    // the particular test)
1268     if (UseSVM())
1269     {
1270         if (gUseHostPtr)
1271             svmAtomicBuffer = (HostAtomicType *)malloc(typeSize * numDestItems);
1272         else
1273             svmAtomicBuffer = (HostAtomicType *)clSVMAlloc(
1274                 context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
1275                 typeSize * numDestItems, 0);
1276         if (!svmAtomicBuffer)
1277         {
            log_error("ERROR: %s failed!\n",
                      gUseHostPtr ? "malloc" : "clSVMAlloc");
1279             return -1;
1280         }
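        // Wrapping the SVM allocation in a cl_mem with CL_MEM_USE_HOST_PTR
        // lets the common code below pass it to the kernel with
        // clSetKernelArg, just like the non-SVM path, instead of using
        // clSetKernelArgSVMPointer.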
1281         memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems);
1282         streams[0] =
1283             clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
1284                            typeSize * numDestItems, svmAtomicBuffer, NULL);
1285     }
1286     else
1287     {
1288         streams[0] =
1289             clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
1290                            typeSize * numDestItems, &destItems[0], NULL);
1291     }
1292     if (!streams[0])
1293     {
1294         log_error("ERROR: Creating output array failed!\n");
1295         return -1;
1296     }
1297     // Create buffer for per-thread input/output data
1298     if (UseSVM())
1299     {
1300         if (gUseHostPtr)
1301             svmDataBuffer = (HostDataType *)malloc(
1302                 typeSize * threadCount * NumNonAtomicVariablesPerThread());
1303         else
1304             svmDataBuffer = (HostDataType *)clSVMAlloc(
1305                 context,
1306                 CL_MEM_SVM_FINE_GRAIN_BUFFER
1307                     | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS
1308                                                        : 0),
1309                 typeSize * threadCount * NumNonAtomicVariablesPerThread(), 0);
1310         if (!svmDataBuffer)
1311         {
            log_error("ERROR: %s failed!\n",
                      gUseHostPtr ? "malloc" : "clSVMAlloc");
1313             return -1;
1314         }
1315         if (startRefValues.size())
1316             memcpy(svmDataBuffer, &startRefValues[0],
1317                    typeSize * threadCount * NumNonAtomicVariablesPerThread());
1318         streams[1] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
1319                                     typeSize * threadCount
1320                                         * NumNonAtomicVariablesPerThread(),
1321                                     svmDataBuffer, NULL);
1322     }
1323     else
1324     {
        streams[1] = clCreateBuffer(
            context,
            startRefValues.size() ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE,
            typeSize * threadCount * NumNonAtomicVariablesPerThread(),
            startRefValues.size() ? &startRefValues[0] : NULL, NULL);
1331     }
1332     if (!streams[1])
1333     {
1334         log_error("ERROR: Creating reference array failed!\n");
1335         return -1;
1336     }
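    // At this point streams[0] holds the atomic destinations and streams[1]
    // the per-thread "old value" slots that are read back for verification.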
1337     if (deviceThreadCount > 0)
1338     {
1339         cl_uint argInd = 0;
1340         /* Set the arguments */
1341         error =
1342             clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount);
1343         test_error(error, "Unable to set kernel argument");
1344         error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems),
1345                                &numDestItems);
1346         test_error(error, "Unable to set indexed kernel argument");
1347         error =
1348             clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]);
        test_error(error, "Unable to set indexed kernel argument");
        error =
            clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]);
        test_error(error, "Unable to set indexed kernel argument");
1353         if (LocalMemory())
1354         {
1355             error =
1356                 clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL);
1357             test_error(error, "Unable to set indexed local kernel argument");
1358         }
        if (LocalRefValues())
        {
            error = clSetKernelArg(kernel, argInd++,
                                   typeSize * CurrentGroupSize()
                                       * NumNonAtomicVariablesPerThread(),
                                   NULL);
            test_error(error, "Unable to set indexed kernel argument");
        }
1370     }
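    // Argument order set here, and assumed to match the generated kernel:
    // total thread count, number of destination items, atomics buffer,
    // old-values buffer, then optional local buffers for atomics and
    // reference values.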
1371     /* Configure host threads */
1372     std::vector<THostThreadContext> hostThreadContexts(hostThreadCount);
1373     for (unsigned int t = 0; t < hostThreadCount; t++)
1374     {
1375         hostThreadContexts[t].test = this;
1376         hostThreadContexts[t].tid = deviceThreadCount + t;
1377         hostThreadContexts[t].threadCount = threadCount;
1378         hostThreadContexts[t].destMemory =
1379             UseSVM() ? svmAtomicBuffer : &destItems[0];
1380         hostThreadContexts[t].oldValues =
1381             UseSVM() ? svmDataBuffer : &refValues[0];
1382     }
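    // Host threads get IDs following the device range, so each thread
    // indexes its own slice of the old-values array while all contend on the
    // same atomic destinations.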
1383 
1384     if (deviceThreadCount > 0)
1385     {
1386         /* Run the kernel */
1387         threadNum[0] = deviceThreadCount;
1388         groupSize = CurrentGroupSize();
1389         error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum,
1390                                        &groupSize, 0, NULL, NULL);
1391         test_error(error, "Unable to execute test kernel");
1392         /* start device threads */
1393         error = clFlush(queue);
1394         test_error(error, "clFlush failed");
1395     }
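    // clFlush submits the enqueued kernel to the device, so device-side work
    // can proceed concurrently with the host threads started below.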
1396 
1397     /* Start host threads and wait for finish */
1398     if (hostThreadCount > 0)
1399         ThreadPool_Do(HostThreadFunction, hostThreadCount,
1400                       &hostThreadContexts[0]);
1401 
1402     if (UseSVM())
1403     {
1404         error = clFinish(queue);
1405         test_error(error, "clFinish failed");
1406         memcpy(&destItems[0], svmAtomicBuffer, typeSize * numDestItems);
1407         memcpy(&refValues[0], svmDataBuffer,
1408                typeSize * threadCount * NumNonAtomicVariablesPerThread());
1409     }
1410     else
1411     {
1412         if (deviceThreadCount > 0)
1413         {
1414             error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0,
1415                                         typeSize * numDestItems, &destItems[0],
1416                                         0, NULL, NULL);
1417             test_error(error, "Unable to read result value!");
1418             error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0,
1419                                         typeSize * deviceThreadCount
1420                                             * NumNonAtomicVariablesPerThread(),
1421                                         &refValues[0], 0, NULL, NULL);
1422             test_error(error, "Unable to read reference values!");
1423         }
1424     }
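    // With SVM the results are already visible through the shared
    // allocation; otherwise they were just copied back via blocking reads.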
1425     bool dataVerified = false;
    // If an expected-value function is provided, compute a final value to
    // compare against; if it is not, verification relies on the reference
    // values alone (see VerifyRefs below).
1429     for (cl_uint i = 0; i < numDestItems; i++)
1430     {
1431         HostDataType expected;
1432 
1433         if (!ExpectedValue(expected, threadCount,
1434                            startRefValues.size() ? &startRefValues[0] : 0, i))
1435             break; // no expected value function provided
1436 
1437         if (expected != destItems[i])
1438         {
1439             std::stringstream logLine;
1440             logLine << "ERROR: Result " << i
1441                     << " from kernel does not validate! (should be " << expected
1442                     << ", was " << destItems[i] << ")\n";
1443             log_error("%s", logLine.str().c_str());
            for (cl_uint t = 0; t < threadCount; t++)
            {
                logLine.str("");
                logLine << " --- " << t << " - ";
                if (startRefValues.size())
                    logLine << startRefValues[t] << " -> " << refValues[t];
                else
                    logLine << refValues[t];
                logLine << " --- ";
                if (t < numDestItems) logLine << destItems[t];
                logLine << "\n";
                log_info("%s", logLine.str().c_str());
            }
1457             if (!gDebug)
1458             {
1459                 log_info("Program source:\n");
1460                 log_info("%s\n", programLine);
1461             }
1462             return -1;
1463         }
1464         dataVerified = true;
1465     }
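    // dataVerified remains false when no expected-value function exists; in
    // that case VerifyRefs() below must do the checking, or the test fails.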
1466 
1467     bool dataCorrect = false;
1468     /* Use the verify function (if provided) to also check the results */
1469     if (VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0]))
1470     {
1471         if (!dataCorrect)
1472         {
1473             log_error("ERROR: Reference values did not validate!\n");
1474             std::stringstream logLine;
1475             for (cl_uint i = 0; i < threadCount; i++)
1476                 for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++)
1477                 {
1478                     logLine.str("");
1479                     logLine
1480                         << " --- " << i << " - "
1481                         << refValues[i * NumNonAtomicVariablesPerThread() + j]
1482                         << " --- ";
1483                     if (j == 0 && i < numDestItems) logLine << destItems[i];
1484                     logLine << "\n";
1485                     log_info("%s", logLine.str().c_str());
1486                 }
1487             if (!gDebug)
1488             {
1489                 log_info("Program source:\n");
1490                 log_info("%s\n", programLine);
1491             }
1492             return -1;
1493         }
1494     }
1495     else if (!dataVerified)
1496     {
1497         log_error("ERROR: Test doesn't check total or refs; no values are "
1498                   "verified!\n");
1499         return -1;
1500     }
1501 
    if (OldValueCheck()
        && !(DeclaredInProgram()
             && !LocalMemory())) // skip for program-scope global atomics: the
                                 // 'old' value was already overwritten by the
                                 // previous clEnqueueNDRangeKernel
1507     {
1508         /* Re-write the starting value */
1509         for (size_t i = 0; i < numDestItems; i++) destItems[i] = _startValue;
1510         refValues[0] = 0;
1511         if (deviceThreadCount > 0)
1512         {
1513             error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0,
1514                                          typeSize * numDestItems, &destItems[0],
1515                                          0, NULL, NULL);
1516             test_error(error, "Unable to write starting values!");
1517 
1518             /* Run the kernel once for a single thread, so we can verify that
1519              * the returned value is the original one */
1520             threadNum[0] = 1;
1521             error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum,
1522                                            threadNum, 0, NULL, NULL);
1523             test_error(error, "Unable to execute test kernel");
1524 
1525             error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize,
1526                                         &refValues[0], 0, NULL, NULL);
1527             test_error(error, "Unable to read reference values!");
1528         }
1529         else
1530         {
1531             /* Start host thread */
1532             HostFunction(0, 1, &destItems[0], &refValues[0]);
1533         }
1534 
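        // A single extra thread just performed one atomic operation on the
        // reset destination; the value it read back must equal the original
        // start value.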
        if (refValues[0] != _startValue)
1536         {
1537             std::stringstream logLine;
            logLine << "ERROR: atomic function operated correctly but did NOT "
                       "return the correct 'old' value (should have been "
                    << destItems[0] << ", returned " << refValues[0] << ")!\n";
1542             log_error("%s", logLine.str().c_str());
1543             if (!gDebug)
1544             {
1545                 log_info("Program source:\n");
1546                 log_info("%s\n", programLine);
1547             }
1548             return -1;
1549         }
1550     }
1551     if (UseSVM())
1552     {
        // The cl_mem wrapper must be released before the underlying SVM
        // allocation is freed; the wrapper's reset() method does that.
1555         streams[0].reset();
1556         if (gUseHostPtr)
1557             free(svmAtomicBuffer);
1558         else
1559             clSVMFree(context, svmAtomicBuffer);
1560         streams[1].reset();
1561         if (gUseHostPtr)
1562             free(svmDataBuffer);
1563         else
1564             clSVMFree(context, svmDataBuffer);
1565     }
1566     _passCount++;
1567     return 0;
1568 }
1569 
1570 #endif // COMMON_H_
1571