//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef COMMON_H_
#define COMMON_H_

#include "harness/testHarness.h"
#include "harness/typeWrappers.h"
#include "harness/ThreadPool.h"

#include "host_atomics.h"

#include <vector>
#include <sstream>

#define MAX_DEVICE_THREADS (gHost ? 0U : gMaxDeviceThreads)
#define MAX_HOST_THREADS GetThreadCount()
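
// Note: when gHost is set (native host-thread verification mode), MAX_DEVICE_THREADS
// evaluates to 0, so no work is submitted to the OpenCL device and only host
// threads (see CBasicTest::MaxHostThreads below) exercise the atomics.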

#define EXECUTE_TEST(error, test)                                              \
    error |= test;                                                             \
    if (error && !gContinueOnError) return error;
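
// EXECUTE_TEST ORs the result of `test` into `error` and, unless
// gContinueOnError is set, returns from the enclosing function on the first
// failure. Typical use, as in ExecuteForEachPointerType() below:
//   int error = 0;
//   EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));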

enum TExplicitAtomicType
{
    TYPE_ATOMIC_INT,
    TYPE_ATOMIC_UINT,
    TYPE_ATOMIC_LONG,
    TYPE_ATOMIC_ULONG,
    TYPE_ATOMIC_FLOAT,
    TYPE_ATOMIC_DOUBLE,
    TYPE_ATOMIC_INTPTR_T,
    TYPE_ATOMIC_UINTPTR_T,
    TYPE_ATOMIC_SIZE_T,
    TYPE_ATOMIC_PTRDIFF_T,
    TYPE_ATOMIC_FLAG
};

enum TExplicitMemoryScopeType
{
    MEMORY_SCOPE_EMPTY,
    MEMORY_SCOPE_WORK_GROUP,
    MEMORY_SCOPE_DEVICE,
    MEMORY_SCOPE_ALL_DEVICES, // Alias for MEMORY_SCOPE_ALL_SVM_DEVICES
    MEMORY_SCOPE_ALL_SVM_DEVICES
};

extern bool
    gHost; // temporary flag for testing native host threads (test verification)
extern bool gOldAPI; // temporary flag for testing with old API (OpenCL 1.2)
extern bool gContinueOnError; // execute all cases even when errors detected
extern bool
    gNoGlobalVariables; // disable cases with global atomics in program scope
extern bool gNoGenericAddressSpace; // disable cases with generic address space
extern bool gUseHostPtr; // use malloc/free instead of clSVMAlloc/clSVMFree
extern bool gDebug; // print OpenCL kernel code
extern int gInternalIterations; // internal test iterations for atomic
                                // operation, sufficient to verify atomicity
extern int
    gMaxDeviceThreads; // maximum number of threads executed on OCL device
extern cl_device_atomic_capabilities gAtomicMemCap,
    gAtomicFenceCap; // atomic memory and fence capabilities for this device

extern const char *
get_memory_order_type_name(TExplicitMemoryOrderType orderType);
extern const char *
get_memory_scope_type_name(TExplicitMemoryScopeType scopeType);

extern cl_int getSupportedMemoryOrdersAndScopes(
    cl_device_id device, std::vector<TExplicitMemoryOrderType> &memoryOrders,
    std::vector<TExplicitMemoryScopeType> &memoryScopes);

class AtomicTypeInfo {
public:
    TExplicitAtomicType _type;
    AtomicTypeInfo(TExplicitAtomicType type): _type(type) {}
    cl_uint Size(cl_device_id device);
    const char *AtomicTypeName();
    const char *RegularTypeName();
    const char *AddSubOperandTypeName();
    int IsSupported(cl_device_id device);
};

template <typename HostDataType>
class AtomicTypeExtendedInfo : public AtomicTypeInfo {
public:
    AtomicTypeExtendedInfo(TExplicitAtomicType type): AtomicTypeInfo(type) {}
    HostDataType MinValue();
    HostDataType MaxValue();
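    // SpecialValue replicates the given byte (or 16-bit) pattern across every
    // byte of HostDataType; e.g. SpecialValue((cl_uchar)0xAA) on a 32-bit type
    // yields 0xAAAAAAAA. Useful for producing easily recognisable test values.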
    HostDataType SpecialValue(cl_uchar x)
    {
        HostDataType tmp;
        cl_uchar *ptr = (cl_uchar *)&tmp;
        for (cl_uint i = 0; i < sizeof(HostDataType) / sizeof(cl_uchar); i++)
            ptr[i] = x;
        return tmp;
    }
    HostDataType SpecialValue(cl_ushort x)
    {
        HostDataType tmp;
        cl_ushort *ptr = (cl_ushort *)&tmp;
        for (cl_uint i = 0; i < sizeof(HostDataType) / sizeof(cl_ushort); i++)
            ptr[i] = x;
        return tmp;
    }
};

class CTest {
public:
    virtual int Execute(cl_device_id deviceID, cl_context context,
                        cl_command_queue queue, int num_elements) = 0;
};

template <typename HostAtomicType, typename HostDataType>
class CBasicTest : CTest {
public:
    typedef struct
    {
        CBasicTest *test;
        cl_uint tid;
        cl_uint threadCount;
        volatile HostAtomicType *destMemory;
        HostDataType *oldValues;
    } THostThreadContext;
    static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id,
                                     void *userInfo)
    {
        THostThreadContext *threadContext =
            ((THostThreadContext *)userInfo) + job_id;
        threadContext->test->HostFunction(
            threadContext->tid, threadContext->threadCount,
            threadContext->destMemory, threadContext->oldValues);
        return 0;
    }
    CBasicTest(TExplicitAtomicType dataType, bool useSVM)
        : CTest(), _maxDeviceThreads(MAX_DEVICE_THREADS), _dataType(dataType),
          _useSVM(useSVM), _startValue(255), _localMemory(false),
          _declaredInProgram(false), _usedInFunction(false),
          _genericAddrSpace(false), _oldValueCheck(true),
          _localRefValues(false), _maxGroupSize(0), _passCount(0),
          _iterations(gInternalIterations)
    {}
    virtual ~CBasicTest()
    {
        if (_passCount)
            log_info(" %u tests executed successfully for %s\n", _passCount,
                     DataType().AtomicTypeName());
    }
    virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
    {
        return 1;
    }
    virtual cl_uint NumNonAtomicVariablesPerThread() { return 1; }
    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
                               HostDataType *startRefValues,
                               cl_uint whichDestValue)
    {
        return false;
    }
    virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
                              MTdata d)
    {
        return false;
    }
    virtual bool VerifyRefs(bool &correct, cl_uint threadCount,
                            HostDataType *refValues,
                            HostAtomicType *finalValues)
    {
        return false;
    }
    virtual std::string PragmaHeader(cl_device_id deviceID);
    virtual std::string ProgramHeader(cl_uint maxNumDestItems);
    virtual std::string FunctionCode();
    virtual std::string KernelCode(cl_uint maxNumDestItems);
    virtual std::string ProgramCore() = 0;
    virtual std::string SingleTestName()
    {
        std::string testName = LocalMemory() ? "local" : "global";
        testName += " ";
        testName += DataType().AtomicTypeName();
        if (DeclaredInProgram())
        {
            testName += " declared in program";
        }
        if (DeclaredInProgram() && UsedInFunction()) testName += ",";
        if (UsedInFunction())
        {
            testName += " used in ";
            if (GenericAddrSpace()) testName += "generic ";
            testName += "function";
        }
        return testName;
    }
    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
                                  cl_command_queue queue);
    int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context,
                                  cl_command_queue queue)
    {
        int error = 0;
        UsedInFunction(false);
        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
        UsedInFunction(true);
        GenericAddrSpace(false);
        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
        GenericAddrSpace(true);
        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
        GenericAddrSpace(false);
        return error;
    }
    int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context,
                                      cl_command_queue queue)
    {
        int error = 0;
        DeclaredInProgram(false);
        EXECUTE_TEST(error,
                     ExecuteForEachPointerType(deviceID, context, queue));
        if (!UseSVM())
        {
            DeclaredInProgram(true);
            EXECUTE_TEST(error,
                         ExecuteForEachPointerType(deviceID, context, queue));
        }
        return error;
    }
    virtual int ExecuteForEachParameterSet(cl_device_id deviceID,
                                           cl_context context,
                                           cl_command_queue queue)
    {
        int error = 0;
        if (_maxDeviceThreads > 0 && !UseSVM())
        {
            LocalMemory(true);
            EXECUTE_TEST(
                error, ExecuteForEachDeclarationType(deviceID, context, queue));
        }
        if (_maxDeviceThreads + MaxHostThreads() > 0)
        {
            LocalMemory(false);
            EXECUTE_TEST(
                error, ExecuteForEachDeclarationType(deviceID, context, queue));
        }
        return error;
    }
    virtual int Execute(cl_device_id deviceID, cl_context context,
                        cl_command_queue queue, int num_elements)
    {
        if (sizeof(HostAtomicType) != DataType().Size(deviceID))
        {
            log_info("Invalid test: Host atomic type size (%u) is different "
                     "than OpenCL type size (%u)\n",
                     (cl_uint)sizeof(HostAtomicType),
                     DataType().Size(deviceID));
            return -1;
        }
        if (sizeof(HostAtomicType) != sizeof(HostDataType))
        {
            log_info("Invalid test: Host atomic type size (%u) is different "
                     "than corresponding type size (%u)\n",
                     (cl_uint)sizeof(HostAtomicType),
                     (cl_uint)sizeof(HostDataType));
            return -1;
        }
        // Verify we can run first
        if (UseSVM() && !gUseHostPtr)
        {
            cl_device_svm_capabilities caps;
            cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES,
                                           sizeof(caps), &caps, 0);
            test_error(error, "clGetDeviceInfo failed");
            if ((caps & CL_DEVICE_SVM_ATOMICS) == 0)
            {
                log_info("\t%s - SVM_ATOMICS not supported\n",
                         DataType().AtomicTypeName());
                // implicit pass
                return 0;
            }
        }
        if (!DataType().IsSupported(deviceID))
        {
            log_info("\t%s not supported\n", DataType().AtomicTypeName());
            // implicit pass or host test (debug feature)
            if (UseSVM()) return 0;
            _maxDeviceThreads = 0;
        }
        if (_maxDeviceThreads + MaxHostThreads() == 0) return 0;
        return ExecuteForEachParameterSet(deviceID, context, queue);
    }
    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
                              volatile HostAtomicType *destMemory,
                              HostDataType *oldValues)
    {
        log_info("Empty thread function %u\n", (cl_uint)tid);
    }
    AtomicTypeExtendedInfo<HostDataType> DataType() const
    {
        return AtomicTypeExtendedInfo<HostDataType>(_dataType);
    }
    cl_uint _maxDeviceThreads;
    virtual cl_uint MaxHostThreads()
    {
        if (UseSVM() || gHost)
            return MAX_HOST_THREADS;
        else
            return 0;
    }

    int CheckCapabilities(TExplicitMemoryScopeType memoryScope,
                          TExplicitMemoryOrderType memoryOrder)
    {
        /*
          Differentiation between atomic fence and other atomic operations
          does not need to occur here.

          The initialisation of this test checks that the minimum required
          capabilities are supported by this device.

          The following switches allow the test to skip if optional
          capabilities are not supported by the device.
        */
        switch (memoryScope)
        {
            case MEMORY_SCOPE_EMPTY: {
                break;
            }
            case MEMORY_SCOPE_WORK_GROUP: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            case MEMORY_SCOPE_DEVICE: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            case MEMORY_SCOPE_ALL_DEVICES: // fallthrough
            case MEMORY_SCOPE_ALL_SVM_DEVICES: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            default: {
                log_info("Invalid memory scope\n");
                break;
            }
        }

        switch (memoryOrder)
        {
            case MEMORY_ORDER_EMPTY: {
                break;
            }
            case MEMORY_ORDER_RELAXED: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_RELAXED) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            case MEMORY_ORDER_ACQUIRE:
            case MEMORY_ORDER_RELEASE:
            case MEMORY_ORDER_ACQ_REL: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            case MEMORY_ORDER_SEQ_CST: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            default: {
                log_info("Invalid memory order\n");
                break;
            }
        }

        return 0;
    }
    virtual bool SVMDataBufferAllSVMConsistent() { return false; }
    bool UseSVM() { return _useSVM; }
    void StartValue(HostDataType startValue) { _startValue = startValue; }
    HostDataType StartValue() { return _startValue; }
    void LocalMemory(bool local) { _localMemory = local; }
    bool LocalMemory() { return _localMemory; }
    void DeclaredInProgram(bool declaredInProgram)
    {
        _declaredInProgram = declaredInProgram;
    }
    bool DeclaredInProgram() { return _declaredInProgram; }
    void UsedInFunction(bool local) { _usedInFunction = local; }
    bool UsedInFunction() { return _usedInFunction; }
    void GenericAddrSpace(bool genericAddrSpace)
    {
        _genericAddrSpace = genericAddrSpace;
    }
    bool GenericAddrSpace() { return _genericAddrSpace; }
    void OldValueCheck(bool check) { _oldValueCheck = check; }
    bool OldValueCheck() { return _oldValueCheck; }
    void LocalRefValues(bool localRefValues)
    {
        _localRefValues = localRefValues;
    }
    bool LocalRefValues() { return _localRefValues; }
    void MaxGroupSize(cl_uint maxGroupSize) { _maxGroupSize = maxGroupSize; }
    cl_uint MaxGroupSize() { return _maxGroupSize; }
    void CurrentGroupSize(cl_uint currentGroupSize)
    {
        if (MaxGroupSize() && MaxGroupSize() < currentGroupSize)
            _currentGroupSize = MaxGroupSize();
        else
            _currentGroupSize = currentGroupSize;
    }
    cl_uint CurrentGroupSize() { return _currentGroupSize; }
    virtual cl_uint CurrentGroupNum(cl_uint threadCount)
    {
        if (threadCount == 0) return 0;
        if (LocalMemory()) return 1;
        return threadCount / CurrentGroupSize();
    }
    cl_int Iterations() { return _iterations; }
    std::string IterationsStr()
    {
        std::stringstream ss;
        ss << _iterations;
        return ss.str();
    }

private:
    const TExplicitAtomicType _dataType;
    const bool _useSVM;
    HostDataType _startValue;
    bool _localMemory;
    bool _declaredInProgram;
    bool _usedInFunction;
    bool _genericAddrSpace;
    bool _oldValueCheck;
    bool _localRefValues;
    cl_uint _maxGroupSize;
    cl_uint _currentGroupSize;
    cl_uint _passCount;
    const cl_int _iterations;
};
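
// Illustrative sketch only (not part of the suite): a concrete test typically
// derives from CBasicTest (or one of the subclasses below), provides the
// kernel body via ProgramCore(), and describes the expected result. The class
// and kernel snippet here are hypothetical; real tests also override
// HostFunction() to perform the same operation from host threads.
//
//   template <typename HostAtomicType, typename HostDataType>
//   class CExampleFetchAdd : public CBasicTest<HostAtomicType, HostDataType> {
//   public:
//       using CBasicTest<HostAtomicType, HostDataType>::StartValue;
//       CExampleFetchAdd(TExplicitAtomicType dataType, bool useSVM = false)
//           : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM)
//       {}
//       virtual std::string ProgramCore()
//       {
//           // each work-item adds 1 to destMemory[0] and records the old value
//           return "  oldValues[tid] = atomic_fetch_add(&destMemory[0], 1);\n";
//       }
//       virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
//                                  HostDataType *startRefValues,
//                                  cl_uint whichDestValue)
//       {
//           expected = StartValue() + (HostDataType)threadCount;
//           return true;
//       }
//   };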

template <typename HostAtomicType, typename HostDataType>
class CBasicTestMemOrderScope
    : public CBasicTest<HostAtomicType, HostDataType> {
public:
    using CBasicTest<HostAtomicType, HostDataType>::LocalMemory;
    using CBasicTest<HostAtomicType, HostDataType>::MaxGroupSize;
    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
    CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false)
        : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM)
    {}
    virtual std::string ProgramHeader(cl_uint maxNumDestItems)
    {
        std::string header;
        if (gOldAPI)
        {
            std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s";
            header += "#define atomic_store_explicit(x,y,o" + s
                + ") atomic_store(x,y)\n"
                  "#define atomic_load_explicit(x,o"
                + s
                + ") atomic_load(x)\n"
                  "#define atomic_exchange_explicit(x,y,o"
                + s
                + ") atomic_exchange(x,y)\n"
                  "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of"
                + s
                + ") atomic_compare_exchange_strong(x,y,z)\n"
                  "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of"
                + s
                + ") atomic_compare_exchange_weak(x,y,z)\n"
                  "#define atomic_fetch_add_explicit(x,y,o"
                + s
                + ") atomic_fetch_add(x,y)\n"
                  "#define atomic_fetch_sub_explicit(x,y,o"
                + s
                + ") atomic_fetch_sub(x,y)\n"
                  "#define atomic_fetch_or_explicit(x,y,o"
                + s
                + ") atomic_fetch_or(x,y)\n"
                  "#define atomic_fetch_xor_explicit(x,y,o"
                + s
                + ") atomic_fetch_xor(x,y)\n"
                  "#define atomic_fetch_and_explicit(x,y,o"
                + s
                + ") atomic_fetch_and(x,y)\n"
                  "#define atomic_fetch_min_explicit(x,y,o"
                + s
                + ") atomic_fetch_min(x,y)\n"
                  "#define atomic_fetch_max_explicit(x,y,o"
                + s
                + ") atomic_fetch_max(x,y)\n"
                  "#define atomic_flag_test_and_set_explicit(x,o"
                + s
                + ") atomic_flag_test_and_set(x)\n"
                  "#define atomic_flag_clear_explicit(x,o"
                + s + ") atomic_flag_clear(x)\n";
        }
        return header
            + CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(
                maxNumDestItems);
    }
    virtual std::string SingleTestName()
    {
        std::string testName =
            CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
        if (MemoryOrder() != MEMORY_ORDER_EMPTY)
        {
            testName += std::string(", ")
                + std::string(get_memory_order_type_name(MemoryOrder()))
                      .substr(sizeof("memory"));
        }
        if (MemoryScope() != MEMORY_SCOPE_EMPTY)
        {
            testName += std::string(", ")
                + std::string(get_memory_scope_type_name(MemoryScope()))
                      .substr(sizeof("memory"));
        }
        return testName;
    }
    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
                                  cl_command_queue queue)
    {
        if (LocalMemory() && MemoryScope() != MEMORY_SCOPE_EMPTY
            && MemoryScope()
                != MEMORY_SCOPE_WORK_GROUP) // memory scope should only be used
                                            // for global memory
            return 0;
        if (MemoryScope() == MEMORY_SCOPE_DEVICE)
            MaxGroupSize(
                16); // increase number of groups by forcing smaller group size
        else
            MaxGroupSize(0); // group size limited by device capabilities

        if (CheckCapabilities(MemoryScope(), MemoryOrder())
            == TEST_SKIPPED_ITSELF)
            return 0; // skip test - not applicable

        return CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(
            deviceID, context, queue);
    }
    virtual int ExecuteForEachParameterSet(cl_device_id deviceID,
                                           cl_context context,
                                           cl_command_queue queue)
    {
        // repeat test for each reasonable memory order/scope combination
        std::vector<TExplicitMemoryOrderType> memoryOrder;
        std::vector<TExplicitMemoryScopeType> memoryScope;
        int error = 0;

        // For OpenCL-3.0 and later some orderings and scopes are optional, so
        // here we query for the supported ones.
        test_error_ret(getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder,
                                                         memoryScope),
                       "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);

        for (unsigned oi = 0; oi < memoryOrder.size(); oi++)
        {
            for (unsigned si = 0; si < memoryScope.size(); si++)
            {
                if (memoryOrder[oi] == MEMORY_ORDER_EMPTY
                    && memoryScope[si] != MEMORY_SCOPE_EMPTY)
                    continue;
                MemoryOrder(memoryOrder[oi]);
                MemoryScope(memoryScope[si]);
                EXECUTE_TEST(
                    error,
                    (CBasicTest<HostAtomicType, HostDataType>::
                         ExecuteForEachParameterSet(deviceID, context, queue)));
            }
        }
        return error;
    }
    void MemoryOrder(TExplicitMemoryOrderType memoryOrder)
    {
        _memoryOrder = memoryOrder;
    }
    TExplicitMemoryOrderType MemoryOrder() { return _memoryOrder; }
    std::string MemoryOrderStr()
    {
        if (MemoryOrder() != MEMORY_ORDER_EMPTY)
            return std::string(", ")
                + get_memory_order_type_name(MemoryOrder());
        return "";
    }
    void MemoryScope(TExplicitMemoryScopeType memoryScope)
    {
        _memoryScope = memoryScope;
    }
    TExplicitMemoryScopeType MemoryScope() { return _memoryScope; }
    std::string MemoryScopeStr()
    {
        if (MemoryScope() != MEMORY_SCOPE_EMPTY)
            return std::string(", ")
                + get_memory_scope_type_name(MemoryScope());
        return "";
    }
    std::string MemoryOrderScopeStr()
    {
        return MemoryOrderStr() + MemoryScopeStr();
    }
    virtual cl_uint CurrentGroupNum(cl_uint threadCount)
    {
        if (MemoryScope() == MEMORY_SCOPE_WORK_GROUP) return 1;
        return CBasicTest<HostAtomicType, HostDataType>::CurrentGroupNum(
            threadCount);
    }
    virtual cl_uint MaxHostThreads()
    {
        // block host threads execution for memory scope different than
        // memory_scope_all_svm_devices
        if (MemoryScope() == MEMORY_SCOPE_ALL_DEVICES
            || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost)
        {
            return CBasicTest<HostAtomicType, HostDataType>::MaxHostThreads();
        }
        else
        {
            return 0;
        }
    }

private:
    TExplicitMemoryOrderType _memoryOrder;
    TExplicitMemoryScopeType _memoryScope;
};
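
// CBasicTestMemOrder2Scope covers built-ins that take two memory-order
// arguments (e.g. atomic_compare_exchange_*_explicit with separate success and
// failure orders); MemoryOrder2() holds the second (failure) order.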

template <typename HostAtomicType, typename HostDataType>
class CBasicTestMemOrder2Scope
    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
public:
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderStr;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;

    CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false)
        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
                                                                useSVM)
    {}
    virtual std::string SingleTestName()
    {
        std::string testName =
            CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
        if (MemoryOrder() != MEMORY_ORDER_EMPTY)
            testName += std::string(", ")
                + std::string(get_memory_order_type_name(MemoryOrder()))
                      .substr(sizeof("memory"));
        if (MemoryOrder2() != MEMORY_ORDER_EMPTY)
            testName += std::string(", ")
                + std::string(get_memory_order_type_name(MemoryOrder2()))
                      .substr(sizeof("memory"));
        if (MemoryScope() != MEMORY_SCOPE_EMPTY)
            testName += std::string(", ")
                + std::string(get_memory_scope_type_name(MemoryScope()))
                      .substr(sizeof("memory"));
        return testName;
    }
    virtual int ExecuteForEachParameterSet(cl_device_id deviceID,
                                           cl_context context,
                                           cl_command_queue queue)
    {
        // repeat test for each reasonable memory order/scope combination
        std::vector<TExplicitMemoryOrderType> memoryOrder;
        std::vector<TExplicitMemoryScopeType> memoryScope;
        int error = 0;

        // For OpenCL-3.0 and later some orderings and scopes are optional, so
        // here we query for the supported ones.
        test_error_ret(getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder,
                                                         memoryScope),
                       "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);

        for (unsigned oi = 0; oi < memoryOrder.size(); oi++)
        {
            for (unsigned o2i = 0; o2i < memoryOrder.size(); o2i++)
            {
                for (unsigned si = 0; si < memoryScope.size(); si++)
                {
                    if ((memoryOrder[oi] == MEMORY_ORDER_EMPTY
                         || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
                        && memoryOrder[oi] != memoryOrder[o2i])
                        continue; // both memory order arguments must be set
                                  // (or none)
                    if ((memoryOrder[oi] == MEMORY_ORDER_EMPTY
                         || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
                        && memoryScope[si] != MEMORY_SCOPE_EMPTY)
                        continue; // memory scope without memory order is not
                                  // allowed
                    MemoryOrder(memoryOrder[oi]);
                    MemoryOrder2(memoryOrder[o2i]);
                    MemoryScope(memoryScope[si]);

                    if (CheckCapabilities(MemoryScope(), MemoryOrder())
                        == TEST_SKIPPED_ITSELF)
                        continue; // skip test - not applicable

                    if (CheckCapabilities(MemoryScope(), MemoryOrder2())
                        == TEST_SKIPPED_ITSELF)
                        continue; // skip test - not applicable

                    EXECUTE_TEST(error,
                                 (CBasicTest<HostAtomicType, HostDataType>::
                                      ExecuteForEachParameterSet(
                                          deviceID, context, queue)));
                }
            }
        }
        return error;
    }
    void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail)
    {
        _memoryOrder2 = memoryOrderFail;
    }
    TExplicitMemoryOrderType MemoryOrder2() { return _memoryOrder2; }
    std::string MemoryOrderFailStr()
    {
        if (MemoryOrder2() != MEMORY_ORDER_EMPTY)
            return std::string(", ")
                + get_memory_order_type_name(MemoryOrder2());
        return "";
    }
    std::string MemoryOrderScope()
    {
        return MemoryOrderStr() + MemoryOrderFailStr() + MemoryScopeStr();
    }

private:
    TExplicitMemoryOrderType _memoryOrder2;
};

template <typename HostAtomicType, typename HostDataType>
std::string
CBasicTest<HostAtomicType, HostDataType>::PragmaHeader(cl_device_id deviceID)
{
    std::string pragma;

    if (gOldAPI)
    {
        pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : "
                  "enable\n";
        pragma += "#pragma OPENCL EXTENSION "
                  "cl_khr_local_int32_extended_atomics : enable\n";
        pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : "
                  "enable\n";
        pragma += "#pragma OPENCL EXTENSION "
                  "cl_khr_global_int32_extended_atomics : enable\n";
    }
    // Create the pragma lines for this kernel
    if (DataType().Size(deviceID) == 8)
    {
        pragma +=
            "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
        pragma +=
            "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n";
    }
    if (_dataType == TYPE_ATOMIC_DOUBLE)
        pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
    return pragma;
}

template <typename HostAtomicType, typename HostDataType>
std::string
CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems)
{
    // Create the program header
    std::string header;
    std::string aTypeName = DataType().AtomicTypeName();
    std::string cTypeName = DataType().RegularTypeName();
    std::string argListForKernel;
    std::string argListForFunction;
    std::string argListNoTypes;
    std::string functionPrototype;
    std::string addressSpace = LocalMemory() ? "__local " : "__global ";

    if (gOldAPI)
    {
        header += std::string("#define ") + aTypeName + " " + cTypeName
            + "\n"
              "#define atomic_store(x,y) (*(x) "
              "= y)\n"
              "#define atomic_load(x) "
              "(*(x))\n"
              "#define ATOMIC_VAR_INIT(x) (x)\n"
              "#define ATOMIC_FLAG_INIT 0\n"
              "#define atomic_init(x,y) "
              "atomic_store(x,y)\n";
        if (aTypeName == "atomic_float")
            header += "#define atomic_exchange(x,y) "
                      " atomic_xchg(x,y)\n";
        else if (aTypeName == "atomic_double")
            header += "double atomic_exchange(volatile " + addressSpace
                + "atomic_double *x, double y)\n"
                  "{\n"
                  " long tmp = *(long*)&y, res;\n"
                  " volatile "
                + addressSpace + "long *tmpA = (volatile " + addressSpace
                + "long)x;\n"
                  " res = atom_xchg(tmpA,tmp);\n"
                  " return *(double*)&res;\n"
                  "}\n";
        else
            header += "#define atomic_exchange(x,y) "
                      " atom_xchg(x,y)\n";
        if (aTypeName != "atomic_float" && aTypeName != "atomic_double")
            header += "bool atomic_compare_exchange_strong(volatile "
                + addressSpace + " " + aTypeName + " *a, " + cTypeName
                + " *expected, " + cTypeName
                + " desired)\n"
                  "{\n"
                  " "
                + cTypeName
                + " old = atom_cmpxchg(a, *expected, desired);\n"
                  " if(old == *expected)\n"
                  " return true;\n"
                  " *expected = old;\n"
                  " return false;\n"
                  "}\n"
                  "#define atomic_compare_exchange_weak "
                  "atomic_compare_exchange_strong\n";
        header += "#define atomic_fetch_add(x,y) "
                  "atom_add(x,y)\n"
                  "#define atomic_fetch_sub(x,y) "
                  "atom_sub(x,y)\n"
                  "#define atomic_fetch_or(x,y) "
                  "atom_or(x,y)\n"
                  "#define atomic_fetch_xor(x,y) "
                  "atom_xor(x,y)\n"
                  "#define atomic_fetch_and(x,y) "
                  "atom_and(x,y)\n"
                  "#define atomic_fetch_min(x,y) "
                  "atom_min(x,y)\n"
                  "#define atomic_fetch_max(x,y) "
                  "atom_max(x,y)\n"
                  "#define atomic_flag_test_and_set(x) "
                  "atomic_exchange(x,1)\n"
                  "#define atomic_flag_clear(x) "
                  "atomic_store(x,0)\n"
                  "\n";
    }
    if (!LocalMemory() && DeclaredInProgram())
    {
        // additional atomic variable for results copying (last thread will do
        // this)
        header += "__global volatile atomic_uint finishedThreads = "
                  "ATOMIC_VAR_INIT(0);\n";
        // atomic variables declared in program scope - test data
        std::stringstream ss;
        ss << maxNumDestItems;
        header += std::string("__global volatile ") + aTypeName + " destMemory["
            + ss.str() + "] = {\n";
        ss.str("");
        ss << _startValue;
        for (cl_uint i = 0; i < maxNumDestItems; i++)
        {
            if (aTypeName == "atomic_flag")
                header += " ATOMIC_FLAG_INIT";
            else
                header += " ATOMIC_VAR_INIT(" + ss.str() + ")";
            if (i + 1 < maxNumDestItems) header += ",";
            header += "\n";
        }
        header += "};\n"
                  "\n";
    }
    return header;
}

template <typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::FunctionCode()
{
    if (!UsedInFunction()) return "";
    std::string addressSpace = LocalMemory() ? "__local " : "__global ";
    std::string code = "void test_atomic_function(uint tid, uint threadCount, "
                       "uint numDestItems, volatile ";
    if (!GenericAddrSpace()) code += addressSpace;
    code += std::string(DataType().AtomicTypeName()) + " *destMemory, __global "
        + DataType().RegularTypeName() + " *oldValues";
    if (LocalRefValues())
        code += std::string(", __local ") + DataType().RegularTypeName()
            + " *localValues";
    code += ")\n"
            "{\n";
    code += ProgramCore();
    code += "}\n"
            "\n";
    return code;
}

template <typename HostAtomicType, typename HostDataType>
std::string
CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
{
    std::string aTypeName = DataType().AtomicTypeName();
    std::string cTypeName = DataType().RegularTypeName();
    std::string addressSpace = LocalMemory() ? "__local " : "__global ";
    std::string code = "__kernel void test_atomic_kernel(uint threadCount, "
                       "uint numDestItems, ";

    // prepare list of arguments for kernel
    if (LocalMemory())
    {
        code += std::string("__global ") + cTypeName + " *finalDest, __global "
            + cTypeName
            + " *oldValues,"
              " volatile "
            + addressSpace + aTypeName + " *"
            + (DeclaredInProgram() ? "notUsed" : "") + "destMemory";
    }
    else
    {
        code += "volatile " + addressSpace
            + (DeclaredInProgram() ? (cTypeName + " *finalDest")
                                   : (aTypeName + " *destMemory"))
            + ", __global " + cTypeName + " *oldValues";
    }
    if (LocalRefValues())
        code += std::string(", __local ") + cTypeName + " *localValues";
    code += ")\n"
            "{\n";
    if (LocalMemory() && DeclaredInProgram())
    {
        // local atomics declared in kernel scope
        std::stringstream ss;
        ss << maxNumDestItems;
        code += std::string(" __local volatile ") + aTypeName + " destMemory["
            + ss.str() + "];\n";
    }
    code += " uint tid = get_global_id(0);\n"
            "\n";
    if (LocalMemory())
    {
        // memory_order_relaxed is sufficient for these initialization
        // operations as the barrier below will act as a fence, providing an
        // order to the operations. memory_scope_work_group is sufficient as
        // local memory is only visible within the work-group.
        code += R"(
  // initialize atomics not reachable from host (first thread
  // is doing this, other threads are waiting on barrier)
  if(get_local_id(0) == 0)
    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)
    {)";
        if (aTypeName == "atomic_flag")
        {
            code += R"(
      if(finalDest[dstItemIdx])
        atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
                                          memory_order_relaxed,
                                          memory_scope_work_group);
      else
        atomic_flag_clear_explicit(destMemory+dstItemIdx,
                                   memory_order_relaxed,
                                   memory_scope_work_group);)";
        }
        else
        {
            code += R"(
      atomic_store_explicit(destMemory+dstItemIdx,
                            finalDest[dstItemIdx],
                            memory_order_relaxed,
                            memory_scope_work_group);)";
        }
        code += " }\n"
                " barrier(CLK_LOCAL_MEM_FENCE);\n"
                "\n";
    }
    if (LocalRefValues())
    {
        code += " // Copy input reference values into local memory\n";
        if (NumNonAtomicVariablesPerThread() == 1)
            code += " localValues[get_local_id(0)] = oldValues[tid];\n";
        else
        {
            std::stringstream ss;
            ss << NumNonAtomicVariablesPerThread();
            code += " for(uint rfId = 0; rfId < " + ss.str()
                + "; rfId++)\n"
                  " localValues[get_local_id(0)*"
                + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n";
        }
        code += " barrier(CLK_LOCAL_MEM_FENCE);\n"
                "\n";
    }
    if (UsedInFunction())
        code += std::string(" test_atomic_function(tid, threadCount, "
                            "numDestItems, destMemory, oldValues")
            + (LocalRefValues() ? ", localValues" : "") + ");\n";
    else
        code += ProgramCore();
    code += "\n";
    if (LocalRefValues())
    {
        code += " // Copy local reference values into output array\n"
                " barrier(CLK_LOCAL_MEM_FENCE);\n";
        if (NumNonAtomicVariablesPerThread() == 1)
            code += " oldValues[tid] = localValues[get_local_id(0)];\n";
        else
        {
            std::stringstream ss;
            ss << NumNonAtomicVariablesPerThread();
            code += " for(uint rfId = 0; rfId < " + ss.str()
                + "; rfId++)\n"
                  " oldValues[tid*"
                + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str()
                + "+rfId];\n";
        }
        code += "\n";
    }
    if (LocalMemory())
    {
        code += " // Copy final values to host reachable buffer\n";
        code += " barrier(CLK_LOCAL_MEM_FENCE);\n"
                " if(get_local_id(0) == 0) // first thread in workgroup\n";
        code += " for(uint dstItemIdx = 0; dstItemIdx < numDestItems; "
                "dstItemIdx++)\n";
        if (aTypeName == "atomic_flag")
        {
            code += R"(
      finalDest[dstItemIdx] =
        atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
                                          memory_order_relaxed,
                                          memory_scope_work_group);)";
        }
        else
        {
            code += R"(
      finalDest[dstItemIdx] =
        atomic_load_explicit(destMemory+dstItemIdx,
                             memory_order_relaxed,
                             memory_scope_work_group);)";
        }
    }
    else if (DeclaredInProgram())
    {
        // global atomics declared in program scope
        code += " // Copy final values to host reachable buffer\n";
        code += R"(
  if(atomic_fetch_add_explicit(&finishedThreads, 1u,
                               memory_order_acq_rel,
                               memory_scope_device)
     == get_global_size(0)-1) // last finished thread
)";
        code += " for(uint dstItemIdx = 0; dstItemIdx < numDestItems; "
                "dstItemIdx++)\n";
        if (aTypeName == "atomic_flag")
        {
            code += R"(
      finalDest[dstItemIdx] =
        atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
                                          memory_order_relaxed,
                                          memory_scope_device);)";
        }
        else
        {
            code += R"(
      finalDest[dstItemIdx] =
        atomic_load_explicit(destMemory+dstItemIdx,
                             memory_order_relaxed,
                             memory_scope_device);)";
        }
    }
    code += "}\n"
            "\n";
    return code;
}

template <typename HostAtomicType, typename HostDataType>
int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(
    cl_device_id deviceID, cl_context context, cl_command_queue queue)
{
    int error;
    clProgramWrapper program;
    clKernelWrapper kernel;
    size_t threadNum[1];
    clMemWrapper streams[2];
    std::vector<HostAtomicType> destItems;
    HostAtomicType *svmAtomicBuffer = 0;
    std::vector<HostDataType> refValues, startRefValues;
    HostDataType *svmDataBuffer = 0;
    cl_uint deviceThreadCount, hostThreadCount, threadCount;
    size_t groupSize = 0;
    std::string programSource;
    const char *programLine;
    MTdata d;
    size_t typeSize = DataType().Size(deviceID);

    deviceThreadCount = _maxDeviceThreads;
    hostThreadCount = MaxHostThreads();
    threadCount = deviceThreadCount + hostThreadCount;

    // log_info("\t%s %s%s...\n", local ? "local" : "global",
    // DataType().AtomicTypeName(), memoryOrderScope.c_str());
    log_info("\t%s...\n", SingleTestName().c_str());

    if (!LocalMemory() && DeclaredInProgram()
        && gNoGlobalVariables) // no support for program scope global variables
    {
        log_info("\t\tTest disabled\n");
        return 0;
    }
    if (UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace)
    {
        log_info("\t\tTest disabled\n");
        return 0;
    }
    if (!LocalMemory() && DeclaredInProgram())
    {
        if (((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0)
            || ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0))
        {
            log_info("\t\tTest disabled\n");
            return 0;
        }
    }

    // set up work sizes based on device capabilities and test configuration
    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                            sizeof(groupSize), &groupSize, NULL);
    test_error(error, "Unable to obtain max work group size for device");
    CurrentGroupSize((cl_uint)groupSize);
    if (CurrentGroupSize() > deviceThreadCount)
        CurrentGroupSize(deviceThreadCount);
    if (CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
        deviceThreadCount =
            CurrentGroupSize() * CurrentGroupNum(deviceThreadCount);
    threadCount = deviceThreadCount + hostThreadCount;

    // If we're given a num_results function, we need to determine how many
    // result objects we need. This is the first assessment for current maximum
    // number of threads (exact thread count is not known here)
    // - needed for program source code generation (arrays of atomics declared
    // in program)
    cl_uint numDestItems = NumResults(threadCount, deviceID);

    if (deviceThreadCount > 0)
    {
        // This loop iteratively reduces the workgroup size by 2 and then
        // re-generates the kernel with the reduced
        // workgroup size until we find a size which is admissible for the
        // kernel being run or reduce the wg size to the trivial case of 1
        // (which was separately verified to be accurate for the kernel being
        // run)

        while ((CurrentGroupSize() > 1))
        {
            // Re-generate the kernel code with the current group size
            if (kernel) clReleaseKernel(kernel);
            if (program) clReleaseProgram(program);
            programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems)
                + FunctionCode() + KernelCode(numDestItems);
            programLine = programSource.c_str();
            if (create_single_kernel_helper_with_build_options(
                    context, &program, &kernel, 1, &programLine,
                    "test_atomic_kernel", gOldAPI ? "" : nullptr))
            {
                return -1;
            }
            // Get work group size for the new kernel
            error = clGetKernelWorkGroupInfo(
                kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(groupSize),
                &groupSize, NULL);
            test_error(error,
                       "Unable to obtain max work group size for device and "
                       "kernel combo");

            if (LocalMemory())
            {
                cl_ulong usedLocalMemory;
                cl_ulong totalLocalMemory;
                cl_uint maxWorkGroupSize;

                error = clGetKernelWorkGroupInfo(
                    kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE,
                    sizeof(usedLocalMemory), &usedLocalMemory, NULL);
                test_error(error, "clGetKernelWorkGroupInfo failed");

                error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
                                        sizeof(totalLocalMemory),
                                        &totalLocalMemory, NULL);
                test_error(error, "clGetDeviceInfo failed");

                // We know that each work-group is going to use typeSize *
                // deviceThreadCount bytes of local memory
                // so pick the maximum value for deviceThreadCount that uses all
                // the local memory.
                maxWorkGroupSize =
                    ((totalLocalMemory - usedLocalMemory) / typeSize);

                if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize;
            }
            if (CurrentGroupSize() <= groupSize)
                break;
            else
                CurrentGroupSize(CurrentGroupSize() / 2);
        }
        if (CurrentGroupSize() > deviceThreadCount)
            CurrentGroupSize(deviceThreadCount);
        if (CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
            deviceThreadCount =
                CurrentGroupSize() * CurrentGroupNum(deviceThreadCount);
        threadCount = deviceThreadCount + hostThreadCount;
    }
    if (gDebug)
    {
        log_info("Program source:\n");
        log_info("%s\n", programLine);
    }
    if (deviceThreadCount > 0)
        log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount,
                 CurrentGroupSize());
    if (hostThreadCount > 0)
        log_info("\t\t(host threads %u)\n", hostThreadCount);

    refValues.resize(threadCount * NumNonAtomicVariablesPerThread());

    // Generate ref data if we have a ref generator provided
    d = init_genrand(gRandomSeed);
    startRefValues.resize(threadCount * NumNonAtomicVariablesPerThread());
    if (GenerateRefs(threadCount, &startRefValues[0], d))
    {
        // copy ref values for host threads
        memcpy(&refValues[0], &startRefValues[0],
               sizeof(HostDataType) * threadCount
                   * NumNonAtomicVariablesPerThread());
    }
    else
    {
        startRefValues.resize(0);
    }
    free_mtdata(d);
    d = NULL;

    // If we're given a num_results function, we need to determine how many
    // result objects we need. If we don't have it, we assume it's just 1. This
    // is the final value (the exact thread count is known at this point).
    numDestItems = NumResults(threadCount, deviceID);

    destItems.resize(numDestItems);
    for (cl_uint i = 0; i < numDestItems; i++) destItems[i] = _startValue;

    // Create main buffer with atomic variables (array size dependent on
    // particular test)
    if (UseSVM())
    {
        if (gUseHostPtr)
            svmAtomicBuffer = (HostAtomicType *)malloc(typeSize * numDestItems);
        else
            svmAtomicBuffer = (HostAtomicType *)clSVMAlloc(
                context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
                typeSize * numDestItems, 0);
        if (!svmAtomicBuffer)
        {
            log_error("ERROR: clSVMAlloc failed!\n");
            return -1;
        }
        memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems);
        streams[0] =
            clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
                           typeSize * numDestItems, svmAtomicBuffer, NULL);
    }
    else
    {
        streams[0] =
            clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                           typeSize * numDestItems, &destItems[0], NULL);
    }
    if (!streams[0])
    {
        log_error("ERROR: Creating output array failed!\n");
        return -1;
    }
    // Create buffer for per-thread input/output data
    if (UseSVM())
    {
        if (gUseHostPtr)
            svmDataBuffer = (HostDataType *)malloc(
                typeSize * threadCount * NumNonAtomicVariablesPerThread());
        else
            svmDataBuffer = (HostDataType *)clSVMAlloc(
                context,
                CL_MEM_SVM_FINE_GRAIN_BUFFER
                    | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS
                                                       : 0),
                typeSize * threadCount * NumNonAtomicVariablesPerThread(), 0);
        if (!svmDataBuffer)
        {
            log_error("ERROR: clSVMAlloc failed!\n");
            return -1;
        }
        if (startRefValues.size())
            memcpy(svmDataBuffer, &startRefValues[0],
                   typeSize * threadCount * NumNonAtomicVariablesPerThread());
        streams[1] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
                                    typeSize * threadCount
                                        * NumNonAtomicVariablesPerThread(),
                                    svmDataBuffer, NULL);
    }
    else
    {
        streams[1] = clCreateBuffer(
            context,
            ((startRefValues.size() ? CL_MEM_COPY_HOST_PTR
                                    : CL_MEM_READ_WRITE)),
            typeSize * threadCount * NumNonAtomicVariablesPerThread(),
            startRefValues.size() ? &startRefValues[0] : 0, NULL);
    }
    if (!streams[1])
    {
        log_error("ERROR: Creating reference array failed!\n");
        return -1;
    }
    if (deviceThreadCount > 0)
    {
        cl_uint argInd = 0;
        /* Set the arguments */
        error =
            clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount);
        test_error(error, "Unable to set kernel argument");
        error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems),
                               &numDestItems);
        test_error(error, "Unable to set indexed kernel argument");
        error =
            clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]);
        test_error(error, "Unable to set indexed kernel arguments");
        error =
            clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]);
        test_error(error, "Unable to set indexed kernel arguments");
        if (LocalMemory())
        {
            error =
                clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL);
            test_error(error, "Unable to set indexed local kernel argument");
        }
        if (LocalRefValues())
        {
            error =
                clSetKernelArg(kernel, argInd++,
                               LocalRefValues() ? typeSize
                                       * (CurrentGroupSize()
                                          * NumNonAtomicVariablesPerThread())
                                                : 1,
                               NULL);
            test_error(error, "Unable to set indexed kernel argument");
        }
    }
    /* Configure host threads */
    std::vector<THostThreadContext> hostThreadContexts(hostThreadCount);
    for (unsigned int t = 0; t < hostThreadCount; t++)
    {
        hostThreadContexts[t].test = this;
        hostThreadContexts[t].tid = deviceThreadCount + t;
        hostThreadContexts[t].threadCount = threadCount;
        hostThreadContexts[t].destMemory =
            UseSVM() ? svmAtomicBuffer : &destItems[0];
        hostThreadContexts[t].oldValues =
            UseSVM() ? svmDataBuffer : &refValues[0];
    }

    if (deviceThreadCount > 0)
    {
        /* Run the kernel */
        threadNum[0] = deviceThreadCount;
        groupSize = CurrentGroupSize();
        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum,
                                       &groupSize, 0, NULL, NULL);
        test_error(error, "Unable to execute test kernel");
        /* start device threads */
        error = clFlush(queue);
        test_error(error, "clFlush failed");
    }

    /* Start host threads and wait for finish */
    if (hostThreadCount > 0)
        ThreadPool_Do(HostThreadFunction, hostThreadCount,
                      &hostThreadContexts[0]);

    if (UseSVM())
    {
        error = clFinish(queue);
        test_error(error, "clFinish failed");
        memcpy(&destItems[0], svmAtomicBuffer, typeSize * numDestItems);
        memcpy(&refValues[0], svmDataBuffer,
               typeSize * threadCount * NumNonAtomicVariablesPerThread());
    }
    else
    {
        if (deviceThreadCount > 0)
        {
            error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0,
                                        typeSize * numDestItems, &destItems[0],
                                        0, NULL, NULL);
            test_error(error, "Unable to read result value!");
            error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0,
                                        typeSize * deviceThreadCount
                                            * NumNonAtomicVariablesPerThread(),
                                        &refValues[0], 0, NULL, NULL);
            test_error(error, "Unable to read reference values!");
        }
    }
    bool dataVerified = false;
    // If we have an expectedFn, then we need to generate a final value to
    // compare against. If we don't have one, it's because we're comparing ref
    // values only
    for (cl_uint i = 0; i < numDestItems; i++)
    {
        HostDataType expected;

        if (!ExpectedValue(expected, threadCount,
                           startRefValues.size() ? &startRefValues[0] : 0, i))
            break; // no expected value function provided

        if (expected != destItems[i])
        {
            std::stringstream logLine;
            logLine << "ERROR: Result " << i
                    << " from kernel does not validate! (should be " << expected
                    << ", was " << destItems[i] << ")\n";
            log_error("%s", logLine.str().c_str());
            for (i = 0; i < threadCount; i++)
            {
                logLine.str("");
                logLine << " --- " << i << " - ";
                if (startRefValues.size())
                    logLine << startRefValues[i] << " -> " << refValues[i];
                else
                    logLine << refValues[i];
                logLine << " --- ";
                if (i < numDestItems) logLine << destItems[i];
                logLine << "\n";
                log_info("%s", logLine.str().c_str());
            }
            if (!gDebug)
            {
                log_info("Program source:\n");
                log_info("%s\n", programLine);
            }
            return -1;
        }
        dataVerified = true;
    }

    bool dataCorrect = false;
    /* Use the verify function (if provided) to also check the results */
    if (VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0]))
    {
        if (!dataCorrect)
        {
            log_error("ERROR: Reference values did not validate!\n");
            std::stringstream logLine;
            for (cl_uint i = 0; i < threadCount; i++)
                for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++)
                {
                    logLine.str("");
                    logLine
                        << " --- " << i << " - "
                        << refValues[i * NumNonAtomicVariablesPerThread() + j]
                        << " --- ";
                    if (j == 0 && i < numDestItems) logLine << destItems[i];
                    logLine << "\n";
                    log_info("%s", logLine.str().c_str());
                }
            if (!gDebug)
            {
                log_info("Program source:\n");
                log_info("%s\n", programLine);
            }
            return -1;
        }
    }
    else if (!dataVerified)
    {
        log_error("ERROR: Test doesn't check total or refs; no values are "
                  "verified!\n");
        return -1;
    }

    if (OldValueCheck()
        && !(DeclaredInProgram()
             && !LocalMemory())) // don't test for program scope global atomics
                                 // 'old' value has been overwritten by previous
                                 // clEnqueueNDRangeKernel
    {
        /* Re-write the starting value */
        for (size_t i = 0; i < numDestItems; i++) destItems[i] = _startValue;
        refValues[0] = 0;
        if (deviceThreadCount > 0)
        {
            error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0,
                                         typeSize * numDestItems, &destItems[0],
                                         0, NULL, NULL);
            test_error(error, "Unable to write starting values!");

            /* Run the kernel once for a single thread, so we can verify that
             * the returned value is the original one */
            threadNum[0] = 1;
            error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum,
                                           threadNum, 0, NULL, NULL);
            test_error(error, "Unable to execute test kernel");

            error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize,
                                        &refValues[0], 0, NULL, NULL);
            test_error(error, "Unable to read reference values!");
        }
        else
        {
            /* Start host thread */
            HostFunction(0, 1, &destItems[0], &refValues[0]);
        }

        if (refValues[0] != _startValue) // destItems[0])
        {
            std::stringstream logLine;
            logLine << "ERROR: atomic function operated correctly but did NOT "
                       "return correct 'old' value "
                       " (should have been "
                    << destItems[0] << ", returned " << refValues[0] << ")!\n";
            log_error("%s", logLine.str().c_str());
            if (!gDebug)
            {
                log_info("Program source:\n");
                log_info("%s\n", programLine);
            }
            return -1;
        }
    }
    if (UseSVM())
    {
        // the buffer object must first be released before the SVM buffer is
        // freed. The Wrapper Class method reset() will do that
        streams[0].reset();
        if (gUseHostPtr)
            free(svmAtomicBuffer);
        else
            clSVMFree(context, svmAtomicBuffer);
        streams[1].reset();
        if (gUseHostPtr)
            free(svmDataBuffer);
        else
            clSVMFree(context, svmDataBuffer);
    }
    _passCount++;
    return 0;
}

#endif // COMMON_H_