1 /*------------------------------------------------------------------------
2  * Vulkan Conformance Tests
3  * ------------------------
4  *
5  * Copyright (c) 2019 The Khronos Group Inc.
6  * Copyright (c) 2019 The Android Open Source Project
7  * Copyright (c) 2023 LunarG, Inc.
8  * Copyright (c) 2023 Nintendo
9  *
10  * Licensed under the Apache License, Version 2.0 (the "License");
11  * you may not use this file except in compliance with the License.
12  * You may obtain a copy of the License at
13  *
14  *      http://www.apache.org/licenses/LICENSE-2.0
15  *
16  * Unless required by applicable law or agreed to in writing, software
17  * distributed under the License is distributed on an "AS IS" BASIS,
18  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19  * See the License for the specific language governing permissions and
20  * limitations under the License.
21  *
22  *//*!
23  * \file
24  * \brief Compute Shader Tests
25  *//*--------------------------------------------------------------------*/
26 
27 #include "vktComputeBasicComputeShaderTests.hpp"
28 #include "vktTestCase.hpp"
29 #include "vktTestCaseUtil.hpp"
30 #include "vktComputeTestsUtil.hpp"
31 #include "vktCustomInstancesDevices.hpp"
32 #include "vktAmberTestCase.hpp"
33 
34 #include "vkDefs.hpp"
35 #include "vkRef.hpp"
36 #include "vkRefUtil.hpp"
37 #include "vkPlatform.hpp"
38 #include "vkPrograms.hpp"
39 #include "vkRefUtil.hpp"
40 #include "vkMemUtil.hpp"
41 #include "vkBarrierUtil.hpp"
42 #include "vkQueryUtil.hpp"
43 #include "vkBuilderUtil.hpp"
44 #include "vkTypeUtil.hpp"
45 #include "vkDeviceUtil.hpp"
46 #include "vkCmdUtil.hpp"
47 #include "vkObjUtil.hpp"
48 #include "vkBufferWithMemory.hpp"
49 #include "vkSafetyCriticalUtil.hpp"
50 #include "vkImageWithMemory.hpp"
51 
52 #include "tcuCommandLine.hpp"
53 #include "tcuTestLog.hpp"
54 #include "tcuMaybe.hpp"
55 
56 #include "deStringUtil.hpp"
57 #include "deUniquePtr.hpp"
58 #include "deRandom.hpp"
59 
60 #include <vector>
61 #include <memory>
62 
63 using namespace vk;
64 
65 namespace vkt
66 {
67 namespace compute
68 {
69 namespace
70 {
71 
72 template <typename T, int size>
multiplyComponents(const tcu::Vector<T,size> & v)73 T multiplyComponents(const tcu::Vector<T, size> &v)
74 {
75     T accum = 1;
76     for (int i = 0; i < size; ++i)
77         accum *= v[i];
78     return accum;
79 }
80 
// Return a * a. Marked constexpr so it can also be used in compile-time
// contexts; behavior for runtime callers is unchanged.
template <typename T>
inline constexpr T squared(const T &a)
{
    return a * a;
}
86 
make2DImageCreateInfo(const tcu::IVec2 & imageSize,const VkImageUsageFlags usage)87 inline VkImageCreateInfo make2DImageCreateInfo(const tcu::IVec2 &imageSize, const VkImageUsageFlags usage)
88 {
89     const VkImageCreateInfo imageParams = {
90         VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,               // VkStructureType sType;
91         DE_NULL,                                           // const void* pNext;
92         0u,                                                // VkImageCreateFlags flags;
93         VK_IMAGE_TYPE_2D,                                  // VkImageType imageType;
94         VK_FORMAT_R32_UINT,                                // VkFormat format;
95         vk::makeExtent3D(imageSize.x(), imageSize.y(), 1), // VkExtent3D extent;
96         1u,                                                // uint32_t mipLevels;
97         1u,                                                // uint32_t arrayLayers;
98         VK_SAMPLE_COUNT_1_BIT,                             // VkSampleCountFlagBits samples;
99         VK_IMAGE_TILING_OPTIMAL,                           // VkImageTiling tiling;
100         usage,                                             // VkImageUsageFlags usage;
101         VK_SHARING_MODE_EXCLUSIVE,                         // VkSharingMode sharingMode;
102         0u,                                                // uint32_t queueFamilyIndexCount;
103         DE_NULL,                                           // const uint32_t* pQueueFamilyIndices;
104         VK_IMAGE_LAYOUT_UNDEFINED,                         // VkImageLayout initialLayout;
105     };
106     return imageParams;
107 }
108 
makeBufferImageCopy(const tcu::IVec2 & imageSize)109 inline VkBufferImageCopy makeBufferImageCopy(const tcu::IVec2 &imageSize)
110 {
111     return compute::makeBufferImageCopy(vk::makeExtent3D(imageSize.x(), imageSize.y(), 1), 1u);
112 }
113 
// Kind of descriptor-backed buffer a test variant reads from / writes to.
enum BufferType
{
    BUFFER_TYPE_UNIFORM, // uniform buffer (UBO)
    BUFFER_TYPE_SSBO,    // shader storage buffer (SSBO)
};
119 
// Tests shared (workgroup-local) variables: each invocation writes into a
// mirrored slot of a shared array, synchronizes with barrier(), then reads
// its own slot back into an SSBO that is verified on the host.
class SharedVarTest : public vkt::TestCase
{
public:
    SharedVarTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec3 &localSize,
                  const tcu::IVec3 &workSize,
                  const vk::ComputePipelineConstructionType computePipelineConstructionType);

    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const tcu::IVec3 m_localSize; // work-group (local) size
    const tcu::IVec3 m_workSize;  // number of work groups per dimension
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
136 
// Runtime counterpart of SharedVarTest: records and submits the dispatch,
// then validates the output buffer contents.
class SharedVarTestInstance : public vkt::TestInstance
{
public:
    SharedVarTestInstance(Context &context, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
                          const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    const tcu::IVec3 m_localSize; // work-group (local) size
    const tcu::IVec3 m_workSize;  // number of work groups per dimension
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
150 
// Stores the dispatch dimensions and pipeline-construction mode for later use.
SharedVarTest::SharedVarTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec3 &localSize,
                             const tcu::IVec3 &workSize,
                             const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_localSize(localSize)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
160 
checkSupport(Context & context) const161 void SharedVarTest::checkSupport(Context &context) const
162 {
163     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
164                                   m_computePipelineConstructionType);
165 }
166 
// Generates the compute shader: every invocation writes
// (globalOffs + localOffs^2) into the *mirrored* shared slot
// offsets[localSize - localOffs - 1], synchronizes, and then copies its own
// shared slot to the output SSBO. The expected output for element
// (globalOffs + localOffs) is therefore globalOffs + (localSize-localOffs-1)^2,
// which iterate() checks via squared().
void SharedVarTest::initPrograms(SourceCollections &sourceCollections) const
{
    const int workGroupSize  = multiplyComponents(m_localSize);
    const int workGroupCount = multiplyComponents(m_workSize);
    const int numValues      = workGroupSize * workGroupCount;

    std::ostringstream src;
    src << "#version 310 es\n"
        << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
        << ", local_size_z = " << m_localSize.z() << ") in;\n"
        << "layout(binding = 0) writeonly buffer Output {\n"
        << "    uint values[" << numValues << "];\n"
        << "} sb_out;\n\n"
        << "shared uint offsets[" << workGroupSize << "];\n\n"
        << "void main (void) {\n"
        << "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
        << "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
           "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
        << "    uint globalOffs = localSize*globalNdx;\n"
        << "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + "
           "gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
        << "\n"
        << "    offsets[localSize-localOffs-1u] = globalOffs + localOffs*localOffs;\n"
        << "    memoryBarrierShared();\n"
        << "    barrier();\n"
        << "    sb_out.values[globalOffs + localOffs] = offsets[localOffs];\n"
        << "}\n";

    sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
197 
createInstance(Context & context) const198 TestInstance *SharedVarTest::createInstance(Context &context) const
199 {
200     return new SharedVarTestInstance(context, m_localSize, m_workSize, m_computePipelineConstructionType);
201 }
202 
// Copies the dispatch dimensions and construction mode from the test case.
SharedVarTestInstance::SharedVarTestInstance(Context &context, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
                                             const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_localSize(localSize)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
211 
// Runs the shared-variable shader once and verifies the output SSBO.
// Expected value per element: globalOffset + (workGroupSize - localOffset - 1)^2,
// matching the mirrored shared-array write performed by initPrograms()'s shader.
tcu::TestStatus SharedVarTestInstance::iterate(void)
{
    const DeviceInterface &vk       = m_context.getDeviceInterface();
    const VkDevice device           = m_context.getDevice();
    const VkQueue queue             = m_context.getUniversalQueue();
    const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
    Allocator &allocator            = m_context.getDefaultAllocator();

    const int workGroupSize  = multiplyComponents(m_localSize);
    const int workGroupCount = multiplyComponents(m_workSize);

    // Create a buffer and host-visible memory for it

    const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * workGroupSize * workGroupCount;
    const BufferWithMemory buffer(vk, device, allocator,
                                  makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                  MemoryRequirement::HostVisible);

    // Create descriptor set: single SSBO at binding 0, matching the shader interface

    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .build(vk, device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
            .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

    const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
        .update(vk, device);

    // Perform the computation

    ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
                                    m_context.getBinaryCollection().get("comp"));
    pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
    pipeline.buildPipeline();

    // Make shader writes visible to subsequent host reads of the buffer
    const VkBufferMemoryBarrier computeFinishBarrier =
        makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

    const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
    const Unique<VkCommandBuffer> cmdBuffer(
        allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

    // Start recording commands

    beginCommandBuffer(vk, *cmdBuffer);

    pipeline.bind(*cmdBuffer);
    vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
                             &descriptorSet.get(), 0u, DE_NULL);

    vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
                          (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &computeFinishBarrier, 0,
                          (const VkImageMemoryBarrier *)DE_NULL);

    endCommandBuffer(vk, *cmdBuffer);

    // Wait for completion

    submitCommandsAndWait(vk, device, queue, *cmdBuffer);

    // Validate the results

    const Allocation &bufferAllocation = buffer.getAllocation();
    // Invalidate the mapped range so the host sees the device's writes
    // (required for non-coherent memory).
    invalidateAlloc(vk, device, bufferAllocation);

    const uint32_t *bufferPtr = static_cast<uint32_t *>(bufferAllocation.getHostPtr());

    for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
    {
        const int globalOffset = groupNdx * workGroupSize;
        for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
        {
            const uint32_t res = bufferPtr[globalOffset + localOffset];
            // Invocation (workGroupSize - localOffset - 1) wrote globalOffs + its
            // own offset squared into shared offsets[localOffset]; this invocation
            // then copied that value out to the SSBO.
            const uint32_t ref = globalOffset + squared(workGroupSize - localOffset - 1);

            if (res != ref)
            {
                std::ostringstream msg;
                msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
                return tcu::TestStatus::fail(msg.str());
            }
        }
    }
    return tcu::TestStatus::pass("Compute succeeded");
}
309 
// Tests atomic operations on a shared variable: all invocations of a work
// group atomicAdd() on one shared counter and each stores its unique old
// value into an SSBO slot, verified on the host.
class SharedVarAtomicOpTest : public vkt::TestCase
{
public:
    SharedVarAtomicOpTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec3 &localSize,
                          const tcu::IVec3 &workSize,
                          const vk::ComputePipelineConstructionType computePipelineConstructionType);

    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const tcu::IVec3 m_localSize; // work-group (local) size
    const tcu::IVec3 m_workSize;  // number of work groups per dimension
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
326 
// Runtime counterpart of SharedVarAtomicOpTest: submits the dispatch and
// validates the output buffer.
class SharedVarAtomicOpTestInstance : public vkt::TestInstance
{
public:
    SharedVarAtomicOpTestInstance(Context &context, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
                                  const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    const tcu::IVec3 m_localSize; // work-group (local) size
    const tcu::IVec3 m_workSize;  // number of work groups per dimension
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
340 
// Stores the dispatch dimensions and pipeline-construction mode for later use.
SharedVarAtomicOpTest::SharedVarAtomicOpTest(tcu::TestContext &testCtx, const std::string &name,
                                             const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
                                             const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_localSize(localSize)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
350 
checkSupport(Context & context) const351 void SharedVarAtomicOpTest::checkSupport(Context &context) const
352 {
353     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
354                                   m_computePipelineConstructionType);
355 }
356 
// Generates the compute shader: the shared counter is zeroed, the work group
// synchronizes, then every invocation atomicAdd()s 1 and stores oldVal+1 at
// output index (globalOffs + oldVal). Since the returned oldVal values are a
// permutation of [0, localSize), element (globalOffs + i) must end up holding
// i + 1, which iterate() verifies.
void SharedVarAtomicOpTest::initPrograms(SourceCollections &sourceCollections) const
{
    const int workGroupSize  = multiplyComponents(m_localSize);
    const int workGroupCount = multiplyComponents(m_workSize);
    const int numValues      = workGroupSize * workGroupCount;

    std::ostringstream src;
    src << "#version 310 es\n"
        << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
        << ", local_size_z = " << m_localSize.z() << ") in;\n"
        << "layout(binding = 0) writeonly buffer Output {\n"
        << "    uint values[" << numValues << "];\n"
        << "} sb_out;\n\n"
        << "shared uint count;\n\n"
        << "void main (void) {\n"
        << "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
        << "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
           "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
        << "    uint globalOffs = localSize*globalNdx;\n"
        << "\n"
        << "    count = 0u;\n"
        << "    memoryBarrierShared();\n"
        << "    barrier();\n"
        << "    uint oldVal = atomicAdd(count, 1u);\n"
        << "    sb_out.values[globalOffs+oldVal] = oldVal+1u;\n"
        << "}\n";

    sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
386 
createInstance(Context & context) const387 TestInstance *SharedVarAtomicOpTest::createInstance(Context &context) const
388 {
389     return new SharedVarAtomicOpTestInstance(context, m_localSize, m_workSize, m_computePipelineConstructionType);
390 }
391 
// Copies the dispatch dimensions and construction mode from the test case.
SharedVarAtomicOpTestInstance::SharedVarAtomicOpTestInstance(
    Context &context, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_localSize(localSize)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
401 
// Runs the shared-counter atomic shader once and verifies the output SSBO.
// Expected value per element: localOffset + 1 (the atomicAdd return values
// form a permutation, so slot i of each group receives i + 1).
tcu::TestStatus SharedVarAtomicOpTestInstance::iterate(void)
{
    const DeviceInterface &vk       = m_context.getDeviceInterface();
    const VkDevice device           = m_context.getDevice();
    const VkQueue queue             = m_context.getUniversalQueue();
    const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
    Allocator &allocator            = m_context.getDefaultAllocator();

    const int workGroupSize  = multiplyComponents(m_localSize);
    const int workGroupCount = multiplyComponents(m_workSize);

    // Create a buffer and host-visible memory for it

    const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * workGroupSize * workGroupCount;
    const BufferWithMemory buffer(vk, device, allocator,
                                  makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                  MemoryRequirement::HostVisible);

    // Create descriptor set: single SSBO at binding 0, matching the shader interface

    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .build(vk, device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
            .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

    const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
        .update(vk, device);

    // Perform the computation

    ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
                                    m_context.getBinaryCollection().get("comp"));
    pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
    pipeline.buildPipeline();

    // Make shader writes visible to subsequent host reads of the buffer
    const VkBufferMemoryBarrier computeFinishBarrier =
        makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

    const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
    const Unique<VkCommandBuffer> cmdBuffer(
        allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

    // Start recording commands

    beginCommandBuffer(vk, *cmdBuffer);

    pipeline.bind(*cmdBuffer);
    vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
                             &descriptorSet.get(), 0u, DE_NULL);

    vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
                          (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1u, &computeFinishBarrier, 0,
                          (const VkImageMemoryBarrier *)DE_NULL);

    endCommandBuffer(vk, *cmdBuffer);

    // Wait for completion

    submitCommandsAndWait(vk, device, queue, *cmdBuffer);

    // Validate the results

    const Allocation &bufferAllocation = buffer.getAllocation();
    // Invalidate the mapped range so the host sees the device's writes
    // (required for non-coherent memory).
    invalidateAlloc(vk, device, bufferAllocation);

    const uint32_t *bufferPtr = static_cast<uint32_t *>(bufferAllocation.getHostPtr());

    for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
    {
        const int globalOffset = groupNdx * workGroupSize;
        for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
        {
            const uint32_t res = bufferPtr[globalOffset + localOffset];
            // The invocation that got atomicAdd() return value localOffset
            // stored (localOffset + 1) at this index.
            const uint32_t ref = localOffset + 1;

            if (res != ref)
            {
                std::ostringstream msg;
                msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
                return tcu::TestStatus::fail(msg.str());
            }
        }
    }
    return tcu::TestStatus::pass("Compute succeeded");
}
499 
// Tests barrier()/memoryBarrierBuffer() ordering of SSBO accesses within a
// work group: invocations read-modify-write their neighbours' SSBO slots
// between barriers, so correct results require the barriers to work.
class SSBOLocalBarrierTest : public vkt::TestCase
{
public:
    SSBOLocalBarrierTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec3 &localSize,
                         const tcu::IVec3 &workSize,
                         const vk::ComputePipelineConstructionType computePipelineConstructionType);

    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const tcu::IVec3 m_localSize; // work-group (local) size
    const tcu::IVec3 m_workSize;  // number of work groups per dimension
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
516 
// Runtime counterpart of SSBOLocalBarrierTest: submits the dispatch and
// validates the output buffer.
class SSBOLocalBarrierTestInstance : public vkt::TestInstance
{
public:
    SSBOLocalBarrierTestInstance(Context &context, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
                                 const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    const tcu::IVec3 m_localSize; // work-group (local) size
    const tcu::IVec3 m_workSize;  // number of work groups per dimension
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
530 
// Stores the dispatch dimensions and pipeline-construction mode for later use.
SSBOLocalBarrierTest::SSBOLocalBarrierTest(tcu::TestContext &testCtx, const std::string &name,
                                           const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
                                           const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_localSize(localSize)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
540 
checkSupport(Context & context) const541 void SSBOLocalBarrierTest::checkSupport(Context &context) const
542 {
543     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
544                                   m_computePipelineConstructionType);
545 }
546 
// Generates the compute shader: each invocation seeds its own SSBO slot with
// globalOffs, then after a barrier adds its localOffs into slot (l+1)%localSize,
// and after another barrier into slot (l+2)%localSize. The buffer is declared
// coherent so the cross-invocation SSBO reads/writes are ordered by
// memoryBarrierBuffer() + barrier(). Final value of slot l:
// globalOffs + ((l-1) mod localSize) + ((l-2) mod localSize), checked by iterate().
void SSBOLocalBarrierTest::initPrograms(SourceCollections &sourceCollections) const
{
    const int workGroupSize  = multiplyComponents(m_localSize);
    const int workGroupCount = multiplyComponents(m_workSize);
    const int numValues      = workGroupSize * workGroupCount;

    std::ostringstream src;
    src << "#version 310 es\n"
        << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
        << ", local_size_z = " << m_localSize.z() << ") in;\n"
        << "layout(binding = 0) coherent buffer Output {\n"
        << "    uint values[" << numValues << "];\n"
        << "} sb_out;\n\n"
        << "void main (void) {\n"
        << "    uint localSize  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_WorkGroupSize.z;\n"
        << "    uint globalNdx  = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
           "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
        << "    uint globalOffs = localSize*globalNdx;\n"
        << "    uint localOffs  = gl_WorkGroupSize.x*gl_WorkGroupSize.y*gl_LocalInvocationID.z + "
           "gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x;\n"
        << "\n"
        << "    sb_out.values[globalOffs + localOffs] = globalOffs;\n"
        << "    memoryBarrierBuffer();\n"
        << "    barrier();\n"
        << "    sb_out.values[globalOffs + ((localOffs+1u)%localSize)] += localOffs;\n" // += so we read and write
        << "    memoryBarrierBuffer();\n"
        << "    barrier();\n"
        << "    sb_out.values[globalOffs + ((localOffs+2u)%localSize)] += localOffs;\n"
        << "}\n";

    sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
}
579 
createInstance(Context & context) const580 TestInstance *SSBOLocalBarrierTest::createInstance(Context &context) const
581 {
582     return new SSBOLocalBarrierTestInstance(context, m_localSize, m_workSize, m_computePipelineConstructionType);
583 }
584 
// Copies the dispatch dimensions and construction mode from the test case.
SSBOLocalBarrierTestInstance::SSBOLocalBarrierTestInstance(
    Context &context, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_localSize(localSize)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
594 
// Runs the SSBO local-barrier shader once and verifies the output SSBO.
// Expected value of slot l within each group:
//   globalOffset + ((l-1) mod workGroupSize) + ((l-2) mod workGroupSize)
// i.e. the seed plus the two neighbour contributions added between barriers.
tcu::TestStatus SSBOLocalBarrierTestInstance::iterate(void)
{
    const DeviceInterface &vk       = m_context.getDeviceInterface();
    const VkDevice device           = m_context.getDevice();
    const VkQueue queue             = m_context.getUniversalQueue();
    const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
    Allocator &allocator            = m_context.getDefaultAllocator();

    const int workGroupSize  = multiplyComponents(m_localSize);
    const int workGroupCount = multiplyComponents(m_workSize);

    // Create a buffer and host-visible memory for it

    const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * workGroupSize * workGroupCount;
    const BufferWithMemory buffer(vk, device, allocator,
                                  makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                  MemoryRequirement::HostVisible);

    // Create descriptor set: single SSBO at binding 0, matching the shader interface

    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .build(vk, device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
            .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

    const VkDescriptorBufferInfo descriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &descriptorInfo)
        .update(vk, device);

    // Perform the computation

    ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
                                    m_context.getBinaryCollection().get("comp"));
    pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
    pipeline.buildPipeline();

    // Make shader writes visible to subsequent host reads of the buffer
    const VkBufferMemoryBarrier computeFinishBarrier =
        makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);

    const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
    const Unique<VkCommandBuffer> cmdBuffer(
        allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

    // Start recording commands

    beginCommandBuffer(vk, *cmdBuffer);

    pipeline.bind(*cmdBuffer);
    vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
                             &descriptorSet.get(), 0u, DE_NULL);

    vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());

    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
                          (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &computeFinishBarrier, 0,
                          (const VkImageMemoryBarrier *)DE_NULL);

    endCommandBuffer(vk, *cmdBuffer);

    // Wait for completion

    submitCommandsAndWait(vk, device, queue, *cmdBuffer);

    // Validate the results

    const Allocation &bufferAllocation = buffer.getAllocation();
    // Invalidate the mapped range so the host sees the device's writes
    // (required for non-coherent memory).
    invalidateAlloc(vk, device, bufferAllocation);

    const uint32_t *bufferPtr = static_cast<uint32_t *>(bufferAllocation.getHostPtr());

    for (int groupNdx = 0; groupNdx < workGroupCount; ++groupNdx)
    {
        const int globalOffset = groupNdx * workGroupSize;
        for (int localOffset = 0; localOffset < workGroupSize; ++localOffset)
        {
            const uint32_t res = bufferPtr[globalOffset + localOffset];
            // offs0/offs1: the local offsets of the two invocations that added
            // into this slot, i.e. (localOffset-1) and (localOffset-2) wrapped
            // modulo the work-group size (kept non-negative for C++ %).
            const int offs0    = localOffset - 1 < 0 ? ((localOffset + workGroupSize - 1) % workGroupSize) :
                                                       ((localOffset - 1) % workGroupSize);
            const int offs1    = localOffset - 2 < 0 ? ((localOffset + workGroupSize - 2) % workGroupSize) :
                                                       ((localOffset - 2) % workGroupSize);
            const uint32_t ref = static_cast<uint32_t>(globalOffset + offs0 + offs1);

            if (res != ref)
            {
                std::ostringstream msg;
                msg << "Comparison failed for Output.values[" << (globalOffset + localOffset) << "]";
                return tcu::TestStatus::fail(msg.str());
            }
        }
    }
    return tcu::TestStatus::pass("Compute succeeded");
}
696 
// Test case: a compute shader copies every texel of an r32ui 2D image into an
// SSBO; the host then verifies the buffer against the original upload data.
class CopyImageToSSBOTest : public vkt::TestCase
{
public:
    // localSize: compute workgroup dimensions (x, y).
    // imageSize: source image dimensions; must be a multiple of localSize in
    // both components (asserted in the constructor).
    CopyImageToSSBOTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec2 &localSize,
                        const tcu::IVec2 &imageSize,
                        const vk::ComputePipelineConstructionType computePipelineConstructionType);

    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const tcu::IVec2 m_localSize; // compute shader local workgroup size
    const tcu::IVec2 m_imageSize; // dimensions of the source image
    vk::ComputePipelineConstructionType m_computePipelineConstructionType; // pipeline vs. shader-object path
};
713 
// Per-execution instance for CopyImageToSSBOTest; performs the actual
// upload, dispatch and host-side verification in iterate().
class CopyImageToSSBOTestInstance : public vkt::TestInstance
{
public:
    CopyImageToSSBOTestInstance(Context &context, const tcu::IVec2 &localSize, const tcu::IVec2 &imageSize,
                                const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    const tcu::IVec2 m_localSize; // compute shader local workgroup size
    const tcu::IVec2 m_imageSize; // dimensions of the source image
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
727 
// The image must divide evenly into workgroups: each invocation copies exactly
// one texel, so a non-multiple size would leave texels uncovered.
CopyImageToSSBOTest::CopyImageToSSBOTest(tcu::TestContext &testCtx, const std::string &name,
                                         const tcu::IVec2 &localSize, const tcu::IVec2 &imageSize,
                                         const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_localSize(localSize)
    , m_imageSize(imageSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
    DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
    DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
}
739 
// Skips the test when the requested pipeline construction path (e.g. shader
// objects) is not supported by the implementation.
void CopyImageToSSBOTest::checkSupport(Context &context) const
{
    checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
                                  m_computePipelineConstructionType);
}
745 
initPrograms(SourceCollections & sourceCollections) const746 void CopyImageToSSBOTest::initPrograms(SourceCollections &sourceCollections) const
747 {
748     std::ostringstream src;
749     src << "#version 310 es\n"
750         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
751         << "layout(binding = 1, r32ui) readonly uniform highp uimage2D u_srcImg;\n"
752         << "layout(binding = 0) writeonly buffer Output {\n"
753         << "    uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
754         << "} sb_out;\n\n"
755         << "void main (void) {\n"
756         << "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
757         << "    uint value  = imageLoad(u_srcImg, ivec2(gl_GlobalInvocationID.xy)).x;\n"
758         << "    sb_out.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x] = value;\n"
759         << "}\n";
760 
761     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
762 }
763 
// Factory: creates the per-execution instance carrying the test parameters.
TestInstance *CopyImageToSSBOTest::createInstance(Context &context) const
{
    return new CopyImageToSSBOTestInstance(context, m_localSize, m_imageSize, m_computePipelineConstructionType);
}
768 
// Stores the test parameters; all Vulkan work happens in iterate().
CopyImageToSSBOTestInstance::CopyImageToSSBOTestInstance(
    Context &context, const tcu::IVec2 &localSize, const tcu::IVec2 &imageSize,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_localSize(localSize)
    , m_imageSize(imageSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
778 
// Runs the test: upload random texels to the image via a staging buffer,
// dispatch the compute shader that copies the image into the SSBO, then
// compare the SSBO contents against the staging data on the host.
tcu::TestStatus CopyImageToSSBOTestInstance::iterate(void)
{
    const DeviceInterface &vk       = m_context.getDeviceInterface();
    const VkDevice device           = m_context.getDevice();
    const VkQueue queue             = m_context.getUniversalQueue();
    const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
    Allocator &allocator            = m_context.getDefaultAllocator();

    // Create an image (r32ui; transfer destination for the upload, storage image for the shader)

    const VkImageCreateInfo imageParams =
        make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
    const ImageWithMemory image(vk, device, allocator, imageParams, MemoryRequirement::Any);

    const VkImageSubresourceRange subresourceRange =
        makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
    const Unique<VkImageView> imageView(
        makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));

    // Staging buffer (source data for image)

    const uint32_t imageArea           = multiplyComponents(m_imageSize);
    const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * imageArea;

    const BufferWithMemory stagingBuffer(vk, device, allocator,
                                         makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
                                         MemoryRequirement::HostVisible);

    // Populate the staging buffer with test data (fixed seed => deterministic reference)
    {
        de::Random rnd(0xab2c7);
        const Allocation &stagingBufferAllocation = stagingBuffer.getAllocation();
        uint32_t *bufferPtr                       = static_cast<uint32_t *>(stagingBufferAllocation.getHostPtr());
        for (uint32_t i = 0; i < imageArea; ++i)
            *bufferPtr++ = rnd.getUint32();

        // Make host writes visible to the device before the copy
        flushAlloc(vk, device, stagingBufferAllocation);
    }

    // Create a buffer to store shader output

    const BufferWithMemory outputBuffer(vk, device, allocator,
                                        makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                        MemoryRequirement::HostVisible);

    // Create descriptor set: binding 0 = output SSBO, binding 1 = source storage image

    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
            .build(vk, device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
            .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

    // Set the bindings

    const VkDescriptorImageInfo imageDescriptorInfo =
        makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
    const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);

    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
                     VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
        .update(vk, device);

    // Perform the computation
    {
        ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
                                        m_context.getBinaryCollection().get("comp"));
        pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
        pipeline.buildPipeline();

        // Make shader writes to the output buffer visible to host reads after the dispatch
        const VkBufferMemoryBarrier computeFinishBarrier = makeBufferMemoryBarrier(
            VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);
        // One invocation per texel => workgroup count is image size / local size
        const tcu::IVec2 workSize = m_imageSize / m_localSize;

        // Prepare the command buffer

        const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
        const Unique<VkCommandBuffer> cmdBuffer(
            allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

        // Start recording commands

        beginCommandBuffer(vk, *cmdBuffer);

        pipeline.bind(*cmdBuffer);
        vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
                                 &descriptorSet.get(), 0u, DE_NULL);

        // Upload staging data into the image; the helper leaves the image in
        // GENERAL layout ready for the compute stage (see its arguments below).
        const std::vector<VkBufferImageCopy> bufferImageCopy(1, makeBufferImageCopy(m_imageSize));
        copyBufferToImage(vk, *cmdBuffer, *stagingBuffer, bufferSizeBytes, bufferImageCopy, VK_IMAGE_ASPECT_COLOR_BIT,
                          1, 1, *image, VK_IMAGE_LAYOUT_GENERAL, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);

        vk.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);
        vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
                              (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &computeFinishBarrier, 0,
                              (const VkImageMemoryBarrier *)DE_NULL);

        endCommandBuffer(vk, *cmdBuffer);

        // Wait for completion

        submitCommandsAndWait(vk, device, queue, *cmdBuffer);
    }

    // Validate the results: output must equal the staging data element-for-element

    const Allocation &outputBufferAllocation = outputBuffer.getAllocation();
    invalidateAlloc(vk, device, outputBufferAllocation);

    const uint32_t *bufferPtr    = static_cast<uint32_t *>(outputBufferAllocation.getHostPtr());
    const uint32_t *refBufferPtr = static_cast<uint32_t *>(stagingBuffer.getAllocation().getHostPtr());

    for (uint32_t ndx = 0; ndx < imageArea; ++ndx)
    {
        const uint32_t res = *(bufferPtr + ndx);
        const uint32_t ref = *(refBufferPtr + ndx);

        if (res != ref)
        {
            std::ostringstream msg;
            msg << "Comparison failed for Output.values[" << ndx << "]";
            return tcu::TestStatus::fail(msg.str());
        }
    }
    return tcu::TestStatus::pass("Compute succeeded");
}
916 
// Test case: the inverse of CopyImageToSSBOTest — a compute shader copies the
// contents of an SSBO into an r32ui 2D image, which is then read back and
// verified on the host.
class CopySSBOToImageTest : public vkt::TestCase
{
public:
    // localSize: compute workgroup dimensions (x, y).
    // imageSize: destination image dimensions; must be a multiple of localSize
    // in both components (asserted in the constructor).
    CopySSBOToImageTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec2 &localSize,
                        const tcu::IVec2 &imageSize,
                        const vk::ComputePipelineConstructionType computePipelineConstructionType);

    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const tcu::IVec2 m_localSize; // compute shader local workgroup size
    const tcu::IVec2 m_imageSize; // dimensions of the destination image
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
933 
// Per-execution instance for CopySSBOToImageTest; performs the dispatch,
// image readback and host-side verification in iterate().
class CopySSBOToImageTestInstance : public vkt::TestInstance
{
public:
    CopySSBOToImageTestInstance(Context &context, const tcu::IVec2 &localSize, const tcu::IVec2 &imageSize,
                                const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    const tcu::IVec2 m_localSize; // compute shader local workgroup size
    const tcu::IVec2 m_imageSize; // dimensions of the destination image
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
947 
// The image must divide evenly into workgroups: each invocation writes exactly
// one texel, so a non-multiple size would leave texels unwritten.
CopySSBOToImageTest::CopySSBOToImageTest(tcu::TestContext &testCtx, const std::string &name,
                                         const tcu::IVec2 &localSize, const tcu::IVec2 &imageSize,
                                         const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_localSize(localSize)
    , m_imageSize(imageSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
    DE_ASSERT(m_imageSize.x() % m_localSize.x() == 0);
    DE_ASSERT(m_imageSize.y() % m_localSize.y() == 0);
}
959 
// Skips the test when the requested pipeline construction path (e.g. shader
// objects) is not supported by the implementation.
void CopySSBOToImageTest::checkSupport(Context &context) const
{
    checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
                                  m_computePipelineConstructionType);
}
965 
initPrograms(SourceCollections & sourceCollections) const966 void CopySSBOToImageTest::initPrograms(SourceCollections &sourceCollections) const
967 {
968     std::ostringstream src;
969     src << "#version 310 es\n"
970         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y() << ") in;\n"
971         << "layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_dstImg;\n"
972         << "layout(binding = 0) readonly buffer Input {\n"
973         << "    uint values[" << (m_imageSize.x() * m_imageSize.y()) << "];\n"
974         << "} sb_in;\n\n"
975         << "void main (void) {\n"
976         << "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
977         << "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
978         << "    imageStore(u_dstImg, ivec2(gl_GlobalInvocationID.xy), uvec4(value, 0, 0, 0));\n"
979         << "}\n";
980 
981     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
982 }
983 
// Factory: creates the per-execution instance carrying the test parameters.
TestInstance *CopySSBOToImageTest::createInstance(Context &context) const
{
    return new CopySSBOToImageTestInstance(context, m_localSize, m_imageSize, m_computePipelineConstructionType);
}
988 
// Stores the test parameters; all Vulkan work happens in iterate().
CopySSBOToImageTestInstance::CopySSBOToImageTestInstance(
    Context &context, const tcu::IVec2 &localSize, const tcu::IVec2 &imageSize,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_localSize(localSize)
    , m_imageSize(imageSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
998 
// Runs the test: fill an SSBO with random data, dispatch the compute shader
// that writes the data into the image, copy the image to a readback buffer,
// then compare the readback against the input on the host.
tcu::TestStatus CopySSBOToImageTestInstance::iterate(void)
{
    ContextCommonData data     = m_context.getContextCommonData();
    const DeviceInterface &vkd = data.vkd;

    // Create an image, a view, and the output buffer (ImageWithBuffer bundles
    // the destination image with a host-readable buffer used for readback)
    const VkImageSubresourceRange subresourceRange =
        makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
    ImageWithBuffer imageWithBuffer(
        vkd, data.device, data.allocator, vk::makeExtent3D(m_imageSize.x(), m_imageSize.y(), 1), VK_FORMAT_R32_UINT,
        VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT, vk::VK_IMAGE_TYPE_2D, subresourceRange);

    const uint32_t imageArea           = multiplyComponents(m_imageSize);
    const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * imageArea;

    const BufferWithMemory inputBuffer(vkd, data.device, data.allocator,
                                       makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                       MemoryRequirement::HostVisible);

    // Populate the buffer with test data (fixed seed => deterministic reference)
    {
        de::Random rnd(0x77238ac2);
        const Allocation &inputBufferAllocation = inputBuffer.getAllocation();
        uint32_t *bufferPtr                     = static_cast<uint32_t *>(inputBufferAllocation.getHostPtr());
        for (uint32_t i = 0; i < imageArea; ++i)
            *bufferPtr++ = rnd.getUint32();

        // Make host writes visible to the device
        flushAlloc(vkd, data.device, inputBufferAllocation);
    }

    // Create descriptor set: binding 0 = input SSBO, binding 1 = destination storage image
    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
            .build(vkd, data.device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
            .build(vkd, data.device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(
        makeDescriptorSet(vkd, data.device, *descriptorPool, *descriptorSetLayout));

    // Set the bindings

    const VkDescriptorImageInfo imageDescriptorInfo =
        makeDescriptorImageInfo(DE_NULL, imageWithBuffer.getImageView(), VK_IMAGE_LAYOUT_GENERAL);
    const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);

    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
                     VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
        .update(vkd, data.device);

    // Perform the computation
    {
        ComputePipelineWrapper pipeline(vkd, data.device, m_computePipelineConstructionType,
                                        m_context.getBinaryCollection().get("comp"));
        pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
        pipeline.buildPipeline();

        // Make host writes to the input buffer visible to shader reads
        const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(
            VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);

        // Transition the freshly-created image to GENERAL for storage-image writes
        const VkImageMemoryBarrier imageLayoutBarrier =
            makeImageMemoryBarrier(0u, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL,
                                   imageWithBuffer.getImage(), subresourceRange);

        // One invocation per texel => workgroup count is image size / local size
        const tcu::IVec2 workSize = m_imageSize / m_localSize;

        // Prepare the command buffer

        const Unique<VkCommandPool> cmdPool(makeCommandPool(vkd, data.device, data.qfIndex));
        const Unique<VkCommandBuffer> cmdBuffer(
            allocateCommandBuffer(vkd, data.device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

        // Start recording commands

        beginCommandBuffer(vkd, *cmdBuffer);

        pipeline.bind(*cmdBuffer);
        vkd.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
                                  &descriptorSet.get(), 0u, DE_NULL);

        vkd.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                               (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1,
                               &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
        vkd.cmdDispatch(*cmdBuffer, workSize.x(), workSize.y(), 1u);

        // Read the image back into the bundled buffer; the helper synchronizes
        // against the preceding shader writes (see its access/layout arguments).
        copyImageToBuffer(vkd, *cmdBuffer, imageWithBuffer.getImage(), imageWithBuffer.getBuffer(), m_imageSize,
                          VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_GENERAL);

        endCommandBuffer(vkd, *cmdBuffer);

        // Wait for completion

        submitCommandsAndWait(vkd, data.device, data.queue, *cmdBuffer);
    }

    // Validate the results: readback must equal the input data texel-for-texel

    const Allocation &outputBufferAllocation = imageWithBuffer.getBufferAllocation();
    invalidateAlloc(vkd, data.device, outputBufferAllocation);

    const uint32_t *bufferPtr    = static_cast<uint32_t *>(outputBufferAllocation.getHostPtr());
    const uint32_t *refBufferPtr = static_cast<uint32_t *>(inputBuffer.getAllocation().getHostPtr());

    for (uint32_t ndx = 0; ndx < imageArea; ++ndx)
    {
        const uint32_t res = *(bufferPtr + ndx);
        const uint32_t ref = *(refBufferPtr + ndx);

        if (res != ref)
        {
            std::ostringstream msg;
            msg << "Comparison failed for pixel " << ndx;
            return tcu::TestStatus::fail(msg.str());
        }
    }
    return tcu::TestStatus::pass("Compute succeeded");
}
1125 
// Test case: a compute shader reads values from an input buffer (UBO or SSBO,
// chosen via the static factory functions) and writes their bitwise inverse
// into an output SSBO.
class BufferToBufferInvertTest : public vkt::TestCase
{
public:
    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

    // Creates the variant that reads from a uniform buffer (BUFFER_TYPE_UNIFORM).
    static BufferToBufferInvertTest *UBOToSSBOInvertCase(
        tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues, const tcu::IVec3 &localSize,
        const tcu::IVec3 &workSize, const vk::ComputePipelineConstructionType computePipelineConstructionType);

    // Creates the variant that reads from a storage buffer (BUFFER_TYPE_SSBO).
    static BufferToBufferInvertTest *CopyInvertSSBOCase(
        tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues, const tcu::IVec3 &localSize,
        const tcu::IVec3 &workSize, const vk::ComputePipelineConstructionType computePipelineConstructionType);

private:
    // Private: construct through the factory functions above, which select the buffer type.
    BufferToBufferInvertTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues,
                             const tcu::IVec3 &localSize, const tcu::IVec3 &workSize, const BufferType bufferType,
                             const vk::ComputePipelineConstructionType computePipelineConstructionType);

    const BufferType m_bufferType; // input buffer kind: UBO or SSBO
    const uint32_t m_numValues;    // total number of uint elements to invert
    const tcu::IVec3 m_localSize;  // compute shader local workgroup size
    const tcu::IVec3 m_workSize;   // number of workgroups dispatched
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
1152 
// Per-execution instance for BufferToBufferInvertTest; performs the dispatch
// and host-side verification in iterate().
class BufferToBufferInvertTestInstance : public vkt::TestInstance
{
public:
    BufferToBufferInvertTestInstance(Context &context, const uint32_t numValues, const tcu::IVec3 &localSize,
                                     const tcu::IVec3 &workSize, const BufferType bufferType,
                                     const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    const BufferType m_bufferType; // input buffer kind: UBO or SSBO
    const uint32_t m_numValues;    // total number of uint elements to invert
    const tcu::IVec3 m_localSize;  // compute shader local workgroup size
    const tcu::IVec3 m_workSize;   // number of workgroups dispatched
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
1169 
// The element count must divide evenly among all invocations (each invocation
// processes numValues / totalInvocations elements in the shader), and only
// the two supported buffer types are allowed.
BufferToBufferInvertTest::BufferToBufferInvertTest(
    tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues, const tcu::IVec3 &localSize,
    const tcu::IVec3 &workSize, const BufferType bufferType,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_bufferType(bufferType)
    , m_numValues(numValues)
    , m_localSize(localSize)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
    DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
    DE_ASSERT(m_bufferType == BUFFER_TYPE_UNIFORM || m_bufferType == BUFFER_TYPE_SSBO);
}
1184 
// Factory for the UBO-input variant; caller takes ownership of the returned test.
BufferToBufferInvertTest *BufferToBufferInvertTest::UBOToSSBOInvertCase(
    tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues, const tcu::IVec3 &localSize,
    const tcu::IVec3 &workSize, const vk::ComputePipelineConstructionType computePipelineConstructionType)
{
    return new BufferToBufferInvertTest(testCtx, name, numValues, localSize, workSize, BUFFER_TYPE_UNIFORM,
                                        computePipelineConstructionType);
}
1192 
// Factory for the SSBO-input variant; caller takes ownership of the returned test.
BufferToBufferInvertTest *BufferToBufferInvertTest::CopyInvertSSBOCase(
    tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues, const tcu::IVec3 &localSize,
    const tcu::IVec3 &workSize, const vk::ComputePipelineConstructionType computePipelineConstructionType)
{
    return new BufferToBufferInvertTest(testCtx, name, numValues, localSize, workSize, BUFFER_TYPE_SSBO,
                                        computePipelineConstructionType);
}
1200 
// Skips the test when the requested pipeline construction path (e.g. shader
// objects) is not supported by the implementation.
void BufferToBufferInvertTest::checkSupport(Context &context) const
{
    checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
                                  m_computePipelineConstructionType);
}
1206 
initPrograms(SourceCollections & sourceCollections) const1207 void BufferToBufferInvertTest::initPrograms(SourceCollections &sourceCollections) const
1208 {
1209     std::ostringstream src;
1210     if (m_bufferType == BUFFER_TYPE_UNIFORM)
1211     {
1212         src << "#version 310 es\n"
1213             << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
1214             << ", local_size_z = " << m_localSize.z() << ") in;\n"
1215             << "layout(binding = 0) readonly uniform Input {\n"
1216             << "    uint values[" << m_numValues << "];\n"
1217             << "} ub_in;\n"
1218             << "layout(binding = 1, std140) writeonly buffer Output {\n"
1219             << "    uint values[" << m_numValues << "];\n"
1220             << "} sb_out;\n"
1221             << "void main (void) {\n"
1222             << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1223             << "    uint numValuesPerInv = uint(ub_in.values.length()) / (size.x*size.y*size.z);\n"
1224             << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
1225                "gl_GlobalInvocationID.x;\n"
1226             << "    uint offset          = numValuesPerInv*groupNdx;\n"
1227             << "\n"
1228             << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1229             << "        sb_out.values[offset + ndx] = ~ub_in.values[offset + ndx];\n"
1230             << "}\n";
1231     }
1232     else if (m_bufferType == BUFFER_TYPE_SSBO)
1233     {
1234         src << "#version 310 es\n"
1235             << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
1236             << ", local_size_z = " << m_localSize.z() << ") in;\n"
1237             << "layout(binding = 0, std140) readonly buffer Input {\n"
1238             << "    uint values[" << m_numValues << "];\n"
1239             << "} sb_in;\n"
1240             << "layout (binding = 1, std140) writeonly buffer Output {\n"
1241             << "    uint values[" << m_numValues << "];\n"
1242             << "} sb_out;\n"
1243             << "void main (void) {\n"
1244             << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1245             << "    uint numValuesPerInv = uint(sb_in.values.length()) / (size.x*size.y*size.z);\n"
1246             << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
1247                "gl_GlobalInvocationID.x;\n"
1248             << "    uint offset          = numValuesPerInv*groupNdx;\n"
1249             << "\n"
1250             << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1251             << "        sb_out.values[offset + ndx] = ~sb_in.values[offset + ndx];\n"
1252             << "}\n";
1253     }
1254 
1255     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1256 }
1257 
// Factory: creates the per-execution instance carrying the test parameters.
TestInstance *BufferToBufferInvertTest::createInstance(Context &context) const
{
    return new BufferToBufferInvertTestInstance(context, m_numValues, m_localSize, m_workSize, m_bufferType,
                                                m_computePipelineConstructionType);
}
1263 
// Stores the test parameters; all Vulkan work happens in iterate().
BufferToBufferInvertTestInstance::BufferToBufferInvertTestInstance(
    Context &context, const uint32_t numValues, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
    const BufferType bufferType, const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_bufferType(bufferType)
    , m_numValues(numValues)
    , m_localSize(localSize)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
1275 
iterate(void)1276 tcu::TestStatus BufferToBufferInvertTestInstance::iterate(void)
1277 {
1278     const DeviceInterface &vk       = m_context.getDeviceInterface();
1279     const VkDevice device           = m_context.getDevice();
1280     const VkQueue queue             = m_context.getUniversalQueue();
1281     const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
1282     Allocator &allocator            = m_context.getDefaultAllocator();
1283 
1284     // Customize the test based on buffer type
1285 
1286     const VkBufferUsageFlags inputBufferUsageFlags =
1287         (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT : VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
1288     const VkDescriptorType inputBufferDescriptorType =
1289         (m_bufferType == BUFFER_TYPE_UNIFORM ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
1290     const uint32_t randomSeed = (m_bufferType == BUFFER_TYPE_UNIFORM ? 0x111223f : 0x124fef);
1291 
1292     // Create an input buffer
1293 
1294     const VkDeviceSize bufferSizeBytes = sizeof(tcu::UVec4) * m_numValues;
1295     const BufferWithMemory inputBuffer(vk, device, allocator,
1296                                        makeBufferCreateInfo(bufferSizeBytes, inputBufferUsageFlags),
1297                                        MemoryRequirement::HostVisible);
1298 
1299     // Fill the input buffer with data
1300     {
1301         de::Random rnd(randomSeed);
1302         const Allocation &inputBufferAllocation = inputBuffer.getAllocation();
1303         tcu::UVec4 *bufferPtr                   = static_cast<tcu::UVec4 *>(inputBufferAllocation.getHostPtr());
1304         for (uint32_t i = 0; i < m_numValues; ++i)
1305             bufferPtr[i].x() = rnd.getUint32();
1306 
1307         flushAlloc(vk, device, inputBufferAllocation);
1308     }
1309 
1310     // Create an output buffer
1311 
1312     const BufferWithMemory outputBuffer(vk, device, allocator,
1313                                         makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
1314                                         MemoryRequirement::HostVisible);
1315 
1316     // Create descriptor set
1317 
1318     const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1319         DescriptorSetLayoutBuilder()
1320             .addSingleBinding(inputBufferDescriptorType, VK_SHADER_STAGE_COMPUTE_BIT)
1321             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1322             .build(vk, device));
1323 
1324     const Unique<VkDescriptorPool> descriptorPool(
1325         DescriptorPoolBuilder()
1326             .addType(inputBufferDescriptorType)
1327             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
1328             .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1329 
1330     const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1331 
1332     const VkDescriptorBufferInfo inputBufferDescriptorInfo =
1333         makeDescriptorBufferInfo(*inputBuffer, 0ull, bufferSizeBytes);
1334     const VkDescriptorBufferInfo outputBufferDescriptorInfo =
1335         makeDescriptorBufferInfo(*outputBuffer, 0ull, bufferSizeBytes);
1336     DescriptorSetUpdateBuilder()
1337         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u), inputBufferDescriptorType,
1338                      &inputBufferDescriptorInfo)
1339         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
1340                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
1341         .update(vk, device);
1342 
1343     // Perform the computation
1344 
1345     ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
1346                                     m_context.getBinaryCollection().get("comp"));
1347     pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
1348     pipeline.buildPipeline();
1349 
1350     const VkBufferMemoryBarrier hostWriteBarrier = makeBufferMemoryBarrier(
1351         VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, bufferSizeBytes);
1352 
1353     const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(
1354         VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, bufferSizeBytes);
1355 
1356     const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1357     const Unique<VkCommandBuffer> cmdBuffer(
1358         allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1359 
1360     // Start recording commands
1361 
1362     beginCommandBuffer(vk, *cmdBuffer);
1363 
1364     pipeline.bind(*cmdBuffer);
1365     vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
1366                              &descriptorSet.get(), 0u, DE_NULL);
1367 
1368     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
1369                           (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &hostWriteBarrier, 0,
1370                           (const VkImageMemoryBarrier *)DE_NULL);
1371     vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1372     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
1373                           (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &shaderWriteBarrier, 0,
1374                           (const VkImageMemoryBarrier *)DE_NULL);
1375 
1376     endCommandBuffer(vk, *cmdBuffer);
1377 
1378     // Wait for completion
1379 
1380     submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1381 
1382     // Validate the results
1383 
1384     const Allocation &outputBufferAllocation = outputBuffer.getAllocation();
1385     invalidateAlloc(vk, device, outputBufferAllocation);
1386 
1387     const tcu::UVec4 *bufferPtr    = static_cast<tcu::UVec4 *>(outputBufferAllocation.getHostPtr());
1388     const tcu::UVec4 *refBufferPtr = static_cast<tcu::UVec4 *>(inputBuffer.getAllocation().getHostPtr());
1389 
1390     for (uint32_t ndx = 0; ndx < m_numValues; ++ndx)
1391     {
1392         const uint32_t res = bufferPtr[ndx].x();
1393         const uint32_t ref = ~refBufferPtr[ndx].x();
1394 
1395         if (res != ref)
1396         {
1397             std::ostringstream msg;
1398             msg << "Comparison failed for Output.values[" << ndx << "]";
1399             return tcu::TestStatus::fail(msg.str());
1400         }
1401     }
1402     return tcu::TestStatus::pass("Compute succeeded");
1403 }
1404 
// Test case: a compute shader bitwise-inverts the contents of a single
// read-write SSBO in place. 'sized' selects whether the shader declares the
// values array with an explicit size or leaves it unsized.
class InvertSSBOInPlaceTest : public vkt::TestCase
{
public:
    InvertSSBOInPlaceTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues,
                          const bool sized, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
                          const vk::ComputePipelineConstructionType computePipelineConstructionType);

    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const uint32_t m_numValues;   // total number of uint values processed
    const bool m_sized;           // true: sized array in shader; false: unsized array
    const tcu::IVec3 m_localSize; // workgroup (local) size
    const tcu::IVec3 m_workSize;  // dispatch size (number of workgroups)
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
1423 
// Per-run instance for InvertSSBOInPlaceTest; performs the actual Vulkan work
// in iterate().
class InvertSSBOInPlaceTestInstance : public vkt::TestInstance
{
public:
    InvertSSBOInPlaceTestInstance(Context &context, const uint32_t numValues, const tcu::IVec3 &localSize,
                                  const tcu::IVec3 &workSize,
                                  const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    const uint32_t m_numValues;   // total number of uint values processed
    const tcu::IVec3 m_localSize; // workgroup (local) size
    const tcu::IVec3 m_workSize;  // dispatch size (number of workgroups)
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
1439 
InvertSSBOInPlaceTest(tcu::TestContext & testCtx,const std::string & name,const uint32_t numValues,const bool sized,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)1440 InvertSSBOInPlaceTest::InvertSSBOInPlaceTest(tcu::TestContext &testCtx, const std::string &name,
1441                                              const uint32_t numValues, const bool sized, const tcu::IVec3 &localSize,
1442                                              const tcu::IVec3 &workSize,
1443                                              const vk::ComputePipelineConstructionType computePipelineConstructionType)
1444     : TestCase(testCtx, name)
1445     , m_numValues(numValues)
1446     , m_sized(sized)
1447     , m_localSize(localSize)
1448     , m_workSize(workSize)
1449     , m_computePipelineConstructionType(computePipelineConstructionType)
1450 {
1451     DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1452 }
1453 
checkSupport(Context & context) const1454 void InvertSSBOInPlaceTest::checkSupport(Context &context) const
1455 {
1456     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
1457                                   m_computePipelineConstructionType);
1458 }
1459 
initPrograms(SourceCollections & sourceCollections) const1460 void InvertSSBOInPlaceTest::initPrograms(SourceCollections &sourceCollections) const
1461 {
1462     std::ostringstream src;
1463     src << "#version 310 es\n"
1464         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
1465         << ", local_size_z = " << m_localSize.z() << ") in;\n"
1466         << "layout(binding = 0) buffer InOut {\n"
1467         << "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1468         << "} sb_inout;\n"
1469         << "void main (void) {\n"
1470         << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1471         << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
1472         << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
1473            "gl_GlobalInvocationID.x;\n"
1474         << "    uint offset          = numValuesPerInv*groupNdx;\n"
1475         << "\n"
1476         << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1477         << "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
1478         << "}\n";
1479 
1480     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1481 }
1482 
createInstance(Context & context) const1483 TestInstance *InvertSSBOInPlaceTest::createInstance(Context &context) const
1484 {
1485     return new InvertSSBOInPlaceTestInstance(context, m_numValues, m_localSize, m_workSize,
1486                                              m_computePipelineConstructionType);
1487 }
1488 
InvertSSBOInPlaceTestInstance(Context & context,const uint32_t numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)1489 InvertSSBOInPlaceTestInstance::InvertSSBOInPlaceTestInstance(
1490     Context &context, const uint32_t numValues, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
1491     const vk::ComputePipelineConstructionType computePipelineConstructionType)
1492     : TestInstance(context)
1493     , m_numValues(numValues)
1494     , m_localSize(localSize)
1495     , m_workSize(workSize)
1496     , m_computePipelineConstructionType(computePipelineConstructionType)
1497 {
1498 }
1499 
iterate(void)1500 tcu::TestStatus InvertSSBOInPlaceTestInstance::iterate(void)
1501 {
1502     const DeviceInterface &vk       = m_context.getDeviceInterface();
1503     const VkDevice device           = m_context.getDevice();
1504     const VkQueue queue             = m_context.getUniversalQueue();
1505     const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
1506     Allocator &allocator            = m_context.getDefaultAllocator();
1507 
1508     // Create an input/output buffer
1509 
1510     const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * m_numValues;
1511     const BufferWithMemory buffer(vk, device, allocator,
1512                                   makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
1513                                   MemoryRequirement::HostVisible);
1514 
1515     // Fill the buffer with data
1516 
1517     typedef std::vector<uint32_t> data_vector_t;
1518     data_vector_t inputData(m_numValues);
1519 
1520     {
1521         de::Random rnd(0x82ce7f);
1522         const Allocation &bufferAllocation = buffer.getAllocation();
1523         uint32_t *bufferPtr                = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
1524         for (uint32_t i = 0; i < m_numValues; ++i)
1525             inputData[i] = *bufferPtr++ = rnd.getUint32();
1526 
1527         flushAlloc(vk, device, bufferAllocation);
1528     }
1529 
1530     // Create descriptor set
1531 
1532     const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1533         DescriptorSetLayoutBuilder()
1534             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1535             .build(vk, device));
1536 
1537     const Unique<VkDescriptorPool> descriptorPool(
1538         DescriptorPoolBuilder()
1539             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
1540             .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1541 
1542     const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1543 
1544     const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
1545     DescriptorSetUpdateBuilder()
1546         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
1547                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
1548         .update(vk, device);
1549 
1550     // Perform the computation
1551 
1552     ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
1553                                     m_context.getBinaryCollection().get("comp"));
1554     pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
1555     pipeline.buildPipeline();
1556 
1557     const VkBufferMemoryBarrier hostWriteBarrier =
1558         makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);
1559 
1560     const VkBufferMemoryBarrier shaderWriteBarrier =
1561         makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
1562 
1563     const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1564     const Unique<VkCommandBuffer> cmdBuffer(
1565         allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1566 
1567     // Start recording commands
1568 
1569     beginCommandBuffer(vk, *cmdBuffer);
1570 
1571     pipeline.bind(*cmdBuffer);
1572     vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
1573                              &descriptorSet.get(), 0u, DE_NULL);
1574 
1575     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
1576                           (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &hostWriteBarrier, 0,
1577                           (const VkImageMemoryBarrier *)DE_NULL);
1578     vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1579     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
1580                           (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &shaderWriteBarrier, 0,
1581                           (const VkImageMemoryBarrier *)DE_NULL);
1582 
1583     endCommandBuffer(vk, *cmdBuffer);
1584 
1585     // Wait for completion
1586 
1587     submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1588 
1589     // Validate the results
1590 
1591     const Allocation &bufferAllocation = buffer.getAllocation();
1592     invalidateAlloc(vk, device, bufferAllocation);
1593 
1594     const uint32_t *bufferPtr = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
1595 
1596     for (uint32_t ndx = 0; ndx < m_numValues; ++ndx)
1597     {
1598         const uint32_t res = bufferPtr[ndx];
1599         const uint32_t ref = ~inputData[ndx];
1600 
1601         if (res != ref)
1602         {
1603             std::ostringstream msg;
1604             msg << "Comparison failed for InOut.values[" << ndx << "]";
1605             return tcu::TestStatus::fail(msg.str());
1606         }
1607     }
1608     return tcu::TestStatus::pass("Compute succeeded");
1609 }
1610 
// Test case: a compute shader writes distinct patterns into two separate
// writeonly SSBOs in a single dispatch. 'sized' selects whether the shader
// declares the arrays with an explicit size or leaves them unsized.
class WriteToMultipleSSBOTest : public vkt::TestCase
{
public:
    WriteToMultipleSSBOTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues,
                            const bool sized, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
                            const vk::ComputePipelineConstructionType computePipelineConstructionType);

    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const uint32_t m_numValues;   // number of uint values per output buffer
    const bool m_sized;           // true: sized arrays in shader; false: unsized arrays
    const tcu::IVec3 m_localSize; // workgroup (local) size
    const tcu::IVec3 m_workSize;  // dispatch size (number of workgroups)
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
1629 
// Per-run instance for WriteToMultipleSSBOTest; performs the actual Vulkan
// work in iterate().
class WriteToMultipleSSBOTestInstance : public vkt::TestInstance
{
public:
    WriteToMultipleSSBOTestInstance(Context &context, const uint32_t numValues, const tcu::IVec3 &localSize,
                                    const tcu::IVec3 &workSize,
                                    const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    const uint32_t m_numValues;   // number of uint values per output buffer
    const tcu::IVec3 m_localSize; // workgroup (local) size
    const tcu::IVec3 m_workSize;  // dispatch size (number of workgroups)
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
1645 
WriteToMultipleSSBOTest(tcu::TestContext & testCtx,const std::string & name,const uint32_t numValues,const bool sized,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)1646 WriteToMultipleSSBOTest::WriteToMultipleSSBOTest(
1647     tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues, const bool sized,
1648     const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
1649     const vk::ComputePipelineConstructionType computePipelineConstructionType)
1650     : TestCase(testCtx, name)
1651     , m_numValues(numValues)
1652     , m_sized(sized)
1653     , m_localSize(localSize)
1654     , m_workSize(workSize)
1655     , m_computePipelineConstructionType(computePipelineConstructionType)
1656 {
1657     DE_ASSERT(m_numValues % (multiplyComponents(m_workSize) * multiplyComponents(m_localSize)) == 0);
1658 }
1659 
checkSupport(Context & context) const1660 void WriteToMultipleSSBOTest::checkSupport(Context &context) const
1661 {
1662     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
1663                                   m_computePipelineConstructionType);
1664 }
1665 
initPrograms(SourceCollections & sourceCollections) const1666 void WriteToMultipleSSBOTest::initPrograms(SourceCollections &sourceCollections) const
1667 {
1668     std::ostringstream src;
1669     src << "#version 310 es\n"
1670         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
1671         << ", local_size_z = " << m_localSize.z() << ") in;\n"
1672         << "layout(binding = 0) writeonly buffer Out0 {\n"
1673         << "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1674         << "} sb_out0;\n"
1675         << "layout(binding = 1) writeonly buffer Out1 {\n"
1676         << "    uint values[" << (m_sized ? de::toString(m_numValues) : "") << "];\n"
1677         << "} sb_out1;\n"
1678         << "void main (void) {\n"
1679         << "    uvec3 size      = gl_NumWorkGroups * gl_WorkGroupSize;\n"
1680         << "    uint groupNdx   = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
1681            "gl_GlobalInvocationID.x;\n"
1682         << "\n"
1683         << "    {\n"
1684         << "        uint numValuesPerInv = uint(sb_out0.values.length()) / (size.x*size.y*size.z);\n"
1685         << "        uint offset          = numValuesPerInv*groupNdx;\n"
1686         << "\n"
1687         << "        for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1688         << "            sb_out0.values[offset + ndx] = offset + ndx;\n"
1689         << "    }\n"
1690         << "    {\n"
1691         << "        uint numValuesPerInv = uint(sb_out1.values.length()) / (size.x*size.y*size.z);\n"
1692         << "        uint offset          = numValuesPerInv*groupNdx;\n"
1693         << "\n"
1694         << "        for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
1695         << "            sb_out1.values[offset + ndx] = uint(sb_out1.values.length()) - offset - ndx;\n"
1696         << "    }\n"
1697         << "}\n";
1698 
1699     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
1700 }
1701 
createInstance(Context & context) const1702 TestInstance *WriteToMultipleSSBOTest::createInstance(Context &context) const
1703 {
1704     return new WriteToMultipleSSBOTestInstance(context, m_numValues, m_localSize, m_workSize,
1705                                                m_computePipelineConstructionType);
1706 }
1707 
WriteToMultipleSSBOTestInstance(Context & context,const uint32_t numValues,const tcu::IVec3 & localSize,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)1708 WriteToMultipleSSBOTestInstance::WriteToMultipleSSBOTestInstance(
1709     Context &context, const uint32_t numValues, const tcu::IVec3 &localSize, const tcu::IVec3 &workSize,
1710     const vk::ComputePipelineConstructionType computePipelineConstructionType)
1711     : TestInstance(context)
1712     , m_numValues(numValues)
1713     , m_localSize(localSize)
1714     , m_workSize(workSize)
1715     , m_computePipelineConstructionType(computePipelineConstructionType)
1716 {
1717 }
1718 
iterate(void)1719 tcu::TestStatus WriteToMultipleSSBOTestInstance::iterate(void)
1720 {
1721     const DeviceInterface &vk       = m_context.getDeviceInterface();
1722     const VkDevice device           = m_context.getDevice();
1723     const VkQueue queue             = m_context.getUniversalQueue();
1724     const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
1725     Allocator &allocator            = m_context.getDefaultAllocator();
1726 
1727     // Create two output buffers
1728 
1729     const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * m_numValues;
1730     const BufferWithMemory buffer0(vk, device, allocator,
1731                                    makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
1732                                    MemoryRequirement::HostVisible);
1733     const BufferWithMemory buffer1(vk, device, allocator,
1734                                    makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
1735                                    MemoryRequirement::HostVisible);
1736 
1737     // Create descriptor set
1738 
1739     const Unique<VkDescriptorSetLayout> descriptorSetLayout(
1740         DescriptorSetLayoutBuilder()
1741             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1742             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
1743             .build(vk, device));
1744 
1745     const Unique<VkDescriptorPool> descriptorPool(
1746         DescriptorPoolBuilder()
1747             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
1748             .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
1749 
1750     const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
1751 
1752     const VkDescriptorBufferInfo buffer0DescriptorInfo = makeDescriptorBufferInfo(*buffer0, 0ull, bufferSizeBytes);
1753     const VkDescriptorBufferInfo buffer1DescriptorInfo = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
1754     DescriptorSetUpdateBuilder()
1755         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
1756                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer0DescriptorInfo)
1757         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
1758                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &buffer1DescriptorInfo)
1759         .update(vk, device);
1760 
1761     // Perform the computation
1762 
1763     ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
1764                                     m_context.getBinaryCollection().get("comp"));
1765     pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
1766     pipeline.buildPipeline();
1767 
1768     const VkBufferMemoryBarrier shaderWriteBarriers[] = {
1769         makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer0, 0ull, bufferSizeBytes),
1770         makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes)};
1771 
1772     const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
1773     const Unique<VkCommandBuffer> cmdBuffer(
1774         allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
1775 
1776     // Start recording commands
1777 
1778     beginCommandBuffer(vk, *cmdBuffer);
1779 
1780     pipeline.bind(*cmdBuffer);
1781     vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
1782                              &descriptorSet.get(), 0u, DE_NULL);
1783 
1784     vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
1785     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
1786                           (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL,
1787                           DE_LENGTH_OF_ARRAY(shaderWriteBarriers), shaderWriteBarriers, 0,
1788                           (const VkImageMemoryBarrier *)DE_NULL);
1789 
1790     endCommandBuffer(vk, *cmdBuffer);
1791 
1792     // Wait for completion
1793 
1794     submitCommandsAndWait(vk, device, queue, *cmdBuffer);
1795 
1796     // Validate the results
1797     {
1798         const Allocation &buffer0Allocation = buffer0.getAllocation();
1799         invalidateAlloc(vk, device, buffer0Allocation);
1800         const uint32_t *buffer0Ptr = static_cast<uint32_t *>(buffer0Allocation.getHostPtr());
1801 
1802         for (uint32_t ndx = 0; ndx < m_numValues; ++ndx)
1803         {
1804             const uint32_t res = buffer0Ptr[ndx];
1805             const uint32_t ref = ndx;
1806 
1807             if (res != ref)
1808             {
1809                 std::ostringstream msg;
1810                 msg << "Comparison failed for Out0.values[" << ndx << "] res=" << res << " ref=" << ref;
1811                 return tcu::TestStatus::fail(msg.str());
1812             }
1813         }
1814     }
1815     {
1816         const Allocation &buffer1Allocation = buffer1.getAllocation();
1817         invalidateAlloc(vk, device, buffer1Allocation);
1818         const uint32_t *buffer1Ptr = static_cast<uint32_t *>(buffer1Allocation.getHostPtr());
1819 
1820         for (uint32_t ndx = 0; ndx < m_numValues; ++ndx)
1821         {
1822             const uint32_t res = buffer1Ptr[ndx];
1823             const uint32_t ref = m_numValues - ndx;
1824 
1825             if (res != ref)
1826             {
1827                 std::ostringstream msg;
1828                 msg << "Comparison failed for Out1.values[" << ndx << "] res=" << res << " ref=" << ref;
1829                 return tcu::TestStatus::fail(msg.str());
1830             }
1831         }
1832     }
1833     return tcu::TestStatus::pass("Compute succeeded");
1834 }
1835 
// Test case using two compute pipelines: the first writes per-workgroup values
// into an SSBO, the second reads them and accumulates an atomic sum — testing
// the buffer barrier between the two dispatches.
class SSBOBarrierTest : public vkt::TestCase
{
public:
    SSBOBarrierTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec3 &workSize,
                    const vk::ComputePipelineConstructionType computePipelineConstructionType);

    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const tcu::IVec3 m_workSize; // dispatch size (number of workgroups)
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
1850 
// Per-run instance for SSBOBarrierTest; performs the actual Vulkan work in
// iterate().
class SSBOBarrierTestInstance : public vkt::TestInstance
{
public:
    SSBOBarrierTestInstance(Context &context, const tcu::IVec3 &workSize,
                            const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    const tcu::IVec3 m_workSize; // dispatch size (number of workgroups)
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
1863 
SSBOBarrierTest(tcu::TestContext & testCtx,const std::string & name,const tcu::IVec3 & workSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)1864 SSBOBarrierTest::SSBOBarrierTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec3 &workSize,
1865                                  const vk::ComputePipelineConstructionType computePipelineConstructionType)
1866     : TestCase(testCtx, name)
1867     , m_workSize(workSize)
1868     , m_computePipelineConstructionType(computePipelineConstructionType)
1869 {
1870 }
1871 
checkSupport(Context & context) const1872 void SSBOBarrierTest::checkSupport(Context &context) const
1873 {
1874     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
1875                                   m_computePipelineConstructionType);
1876 }
1877 
// Register the two GLSL compute shaders used by the test:
//  - "comp0": each work group writes (u_baseVal + linear work-group offset)
//    into its slot of the Output SSBO (binding 1).
//  - "comp1": each work group reads its slot from the same buffer (now bound
//    as readonly Input) and atomically accumulates it into a single sum
//    (binding 0). Correctness depends on the buffer barrier recorded between
//    the two dispatches in iterate().
void SSBOBarrierTest::initPrograms(SourceCollections &sourceCollections) const
{
    sourceCollections.glslSources.add("comp0")
        << glu::ComputeSource("#version 310 es\n"
                              "layout (local_size_x = 1) in;\n"
                              "layout(binding = 2) readonly uniform Constants {\n"
                              "    uint u_baseVal;\n"
                              "};\n"
                              "layout(binding = 1) writeonly buffer Output {\n"
                              "    uint values[];\n"
                              "};\n"
                              "void main (void) {\n"
                              "    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
                              "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
                              "    values[offset] = u_baseVal + offset;\n"
                              "}\n");

    sourceCollections.glslSources.add("comp1")
        << glu::ComputeSource("#version 310 es\n"
                              "layout (local_size_x = 1) in;\n"
                              "layout(binding = 1) readonly buffer Input {\n"
                              "    uint values[];\n"
                              "};\n"
                              "layout(binding = 0) coherent buffer Output {\n"
                              "    uint sum;\n"
                              "};\n"
                              "void main (void) {\n"
                              "    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
                              "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
                              "    uint value  = values[offset];\n"
                              "    atomicAdd(sum, value);\n"
                              "}\n");
}
1911 
createInstance(Context & context) const1912 TestInstance *SSBOBarrierTest::createInstance(Context &context) const
1913 {
1914     return new SSBOBarrierTestInstance(context, m_workSize, m_computePipelineConstructionType);
1915 }
1916 
// Instance constructor: just captures the parameters chosen by the test case.
SSBOBarrierTestInstance::SSBOBarrierTestInstance(
    Context &context, const tcu::IVec3 &workSize,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_workSize(workSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
1925 
// Execute the SSBO barrier test:
//  1. comp0 writes (baseValue + offset) per work group into the work buffer.
//  2. A compute->compute buffer barrier makes those writes visible.
//  3. comp1 atomically sums the work buffer into a single output word.
//  4. The host compares the sum against a CPU-computed reference.
tcu::TestStatus SSBOBarrierTestInstance::iterate(void)
{
    const DeviceInterface &vk       = m_context.getDeviceInterface();
    const VkDevice device           = m_context.getDevice();
    const VkQueue queue             = m_context.getUniversalQueue();
    const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
    Allocator &allocator            = m_context.getDefaultAllocator();

    // Create a work buffer used by both shaders (one uint per work group)

    const int workGroupCount               = multiplyComponents(m_workSize);
    const VkDeviceSize workBufferSizeBytes = sizeof(uint32_t) * workGroupCount;
    const BufferWithMemory workBuffer(vk, device, allocator,
                                      makeBufferCreateInfo(workBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                      MemoryRequirement::Any);

    // Create an output buffer (single uint holding the atomic sum)

    const VkDeviceSize outputBufferSizeBytes = sizeof(uint32_t);
    const BufferWithMemory outputBuffer(vk, device, allocator,
                                        makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                        MemoryRequirement::HostVisible);

    // Initialize atomic counter value to zero
    {
        const Allocation &outputBufferAllocation = outputBuffer.getAllocation();
        uint32_t *outputBufferPtr                = static_cast<uint32_t *>(outputBufferAllocation.getHostPtr());
        *outputBufferPtr                         = 0;
        flushAlloc(vk, device, outputBufferAllocation);
    }

    // Create a uniform buffer (to pass uniform constants)

    const VkDeviceSize uniformBufferSizeBytes = sizeof(uint32_t);
    const BufferWithMemory uniformBuffer(
        vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT),
        MemoryRequirement::HostVisible);

    // Set the constants in the uniform buffer (u_baseVal in comp0)

    const uint32_t baseValue = 127;
    {
        const Allocation &uniformBufferAllocation = uniformBuffer.getAllocation();
        uint32_t *uniformBufferPtr                = static_cast<uint32_t *>(uniformBufferAllocation.getHostPtr());
        uniformBufferPtr[0]                       = baseValue;

        flushAlloc(vk, device, uniformBufferAllocation);
    }

    // Create descriptor set. One set is shared by both pipelines: the two
    // shaders use disjoint-but-compatible bindings (0 = sum SSBO, 1 = work
    // SSBO, 2 = constants UBO).

    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .build(vk, device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 2u)
            .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
            .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

    const VkDescriptorBufferInfo workBufferDescriptorInfo =
        makeDescriptorBufferInfo(*workBuffer, 0ull, workBufferSizeBytes);
    const VkDescriptorBufferInfo outputBufferDescriptorInfo =
        makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
    const VkDescriptorBufferInfo uniformBufferDescriptorInfo =
        makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &workBufferDescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u),
                     VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
        .update(vk, device);

    // Perform the computation

    ComputePipelineWrapper pipeline0(vk, device, m_computePipelineConstructionType,
                                     m_context.getBinaryCollection().get("comp0"));
    pipeline0.setDescriptorSetLayout(descriptorSetLayout.get());
    pipeline0.buildPipeline();

    ComputePipelineWrapper pipeline1(vk, device, m_computePipelineConstructionType,
                                     m_context.getBinaryCollection().get("comp1"));
    pipeline1.setDescriptorSetLayout(descriptorSetLayout.get());
    pipeline1.buildPipeline();

    // Host write of constants -> uniform read in comp0
    const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(
        VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);

    // comp0 SSBO writes -> comp1 SSBO reads; this is the barrier under test
    const VkBufferMemoryBarrier betweenShadersBarrier = makeBufferMemoryBarrier(
        VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *workBuffer, 0ull, workBufferSizeBytes);

    // comp1 atomic writes -> host read of the result
    const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(
        VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);

    const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
    const Unique<VkCommandBuffer> cmdBuffer(
        allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

    // Start recording commands

    beginCommandBuffer(vk, *cmdBuffer);

    pipeline0.bind(*cmdBuffer);
    vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline0.getPipelineLayout(), 0u, 1u,
                             &descriptorSet.get(), 0u, DE_NULL);

    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                          (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &writeUniformConstantsBarrier,
                          0, (const VkImageMemoryBarrier *)DE_NULL);

    vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                          (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &betweenShadersBarrier, 0,
                          (const VkImageMemoryBarrier *)DE_NULL);

    // Switch to the second shader program (descriptor set stays bound;
    // layouts are identical so no rebind is needed)
    pipeline1.bind(*cmdBuffer);

    vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
    vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
                          (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &afterComputeBarrier, 0,
                          (const VkImageMemoryBarrier *)DE_NULL);

    endCommandBuffer(vk, *cmdBuffer);

    // Wait for completion

    submitCommandsAndWait(vk, device, queue, *cmdBuffer);

    // Validate the results

    const Allocation &outputBufferAllocation = outputBuffer.getAllocation();
    invalidateAlloc(vk, device, outputBufferAllocation);

    const uint32_t *bufferPtr = static_cast<uint32_t *>(outputBufferAllocation.getHostPtr());
    const uint32_t res        = *bufferPtr;
    uint32_t ref              = 0;

    // uint32 wrap-around (if any) matches the shader's uint arithmetic
    for (int ndx = 0; ndx < workGroupCount; ++ndx)
        ref += baseValue + ndx;

    if (res != ref)
    {
        std::ostringstream msg;
        msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
        return tcu::TestStatus::fail(msg.str());
    }
    return tcu::TestStatus::pass("Compute succeeded");
}
2083 
// Test case: every invocation of a work group atomically adds its input value
// into a single r32ui image texel (one texel per work group), exercising
// imageAtomicAdd together with in-group barriers.
class ImageAtomicOpTest : public vkt::TestCase
{
public:
    ImageAtomicOpTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t localSize,
                      const tcu::IVec2 &imageSize,
                      const vk::ComputePipelineConstructionType computePipelineConstructionType);

    // Fails early if the requested pipeline construction mode is unsupported.
    virtual void checkSupport(Context &context) const;
    // Builds the GLSL compute shader ("comp").
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const uint32_t m_localSize;   // local_size_x (invocations per work group)
    const tcu::IVec2 m_imageSize; // image extent; also the dispatch size in work groups
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
2100 
// Instance for ImageAtomicOpTest: dispatches the shader, copies the image to
// a host-visible buffer, and checks every texel against a CPU reference sum.
class ImageAtomicOpTestInstance : public vkt::TestInstance
{
public:
    ImageAtomicOpTestInstance(Context &context, const uint32_t localSize, const tcu::IVec2 &imageSize,
                              const vk::ComputePipelineConstructionType computePipelineConstructionType);

    // Runs the test once; returns pass/fail status.
    tcu::TestStatus iterate(void);

private:
    const uint32_t m_localSize;   // invocations per work group
    const tcu::IVec2 m_imageSize; // result image extent / work-group grid
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
2114 
// ImageAtomicOpTest constructor: stores the shader/dispatch parameters and
// the pipeline construction mode for later use by the instance.
ImageAtomicOpTest::ImageAtomicOpTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t localSize,
                                     const tcu::IVec2 &imageSize,
                                     const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_localSize(localSize)
    , m_imageSize(imageSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
2124 
checkSupport(Context & context) const2125 void ImageAtomicOpTest::checkSupport(Context &context) const
2126 {
2127     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
2128                                   m_computePipelineConstructionType);
2129 }
2130 
initPrograms(SourceCollections & sourceCollections) const2131 void ImageAtomicOpTest::initPrograms(SourceCollections &sourceCollections) const
2132 {
2133     std::ostringstream src;
2134     src << "#version 310 es\n"
2135         << "#extension GL_OES_shader_image_atomic : require\n"
2136         << "layout (local_size_x = " << m_localSize << ") in;\n"
2137         << "layout(binding = 1, r32ui) coherent uniform highp uimage2D u_dstImg;\n"
2138         << "layout(binding = 0) readonly buffer Input {\n"
2139         << "    uint values[" << (multiplyComponents(m_imageSize) * m_localSize) << "];\n"
2140         << "} sb_in;\n\n"
2141         << "void main (void) {\n"
2142         << "    uint stride = gl_NumWorkGroups.x*gl_WorkGroupSize.x;\n"
2143         << "    uint value  = sb_in.values[gl_GlobalInvocationID.y*stride + gl_GlobalInvocationID.x];\n"
2144         << "\n"
2145         << "    if (gl_LocalInvocationIndex == 0u)\n"
2146         << "        imageStore(u_dstImg, ivec2(gl_WorkGroupID.xy), uvec4(0));\n"
2147         << "    memoryBarrierImage();\n"
2148         << "    barrier();\n"
2149         << "    imageAtomicAdd(u_dstImg, ivec2(gl_WorkGroupID.xy), value);\n"
2150         << "}\n";
2151 
2152     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2153 }
2154 
createInstance(Context & context) const2155 TestInstance *ImageAtomicOpTest::createInstance(Context &context) const
2156 {
2157     return new ImageAtomicOpTestInstance(context, m_localSize, m_imageSize, m_computePipelineConstructionType);
2158 }
2159 
// Instance constructor: captures the parameters chosen by the test case.
ImageAtomicOpTestInstance::ImageAtomicOpTestInstance(
    Context &context, const uint32_t localSize, const tcu::IVec2 &imageSize,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_localSize(localSize)
    , m_imageSize(imageSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
2169 
// Execute the image atomic-op test:
//  1. Fill an input SSBO with random uints (one per invocation).
//  2. Dispatch one work group per image texel; the shader atomically sums
//     the group's values into its r32ui texel.
//  3. Copy the image into a host-visible buffer.
//  4. Verify each texel equals the CPU-computed sum of its group's inputs.
tcu::TestStatus ImageAtomicOpTestInstance::iterate(void)
{
    const DeviceInterface &vk       = m_context.getDeviceInterface();
    const VkDevice device           = m_context.getDevice();
    const VkQueue queue             = m_context.getUniversalQueue();
    const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
    Allocator &allocator            = m_context.getDefaultAllocator();

    // Create an image (r32ui storage image, also source of the final copy)

    const VkImageCreateInfo imageParams =
        make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT);
    const ImageWithMemory image(vk, device, allocator, imageParams, MemoryRequirement::Any);

    const VkImageSubresourceRange subresourceRange =
        makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
    const Unique<VkImageView> imageView(
        makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));

    // Input buffer (one uint per invocation across the whole dispatch)

    const uint32_t numInputValues           = multiplyComponents(m_imageSize) * m_localSize;
    const VkDeviceSize inputBufferSizeBytes = sizeof(uint32_t) * numInputValues;

    const BufferWithMemory inputBuffer(vk, device, allocator,
                                       makeBufferCreateInfo(inputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
                                       MemoryRequirement::HostVisible);

    // Populate the input buffer with test data (fixed seed => deterministic)
    {
        de::Random rnd(0x77238ac2);
        const Allocation &inputBufferAllocation = inputBuffer.getAllocation();
        uint32_t *bufferPtr                     = static_cast<uint32_t *>(inputBufferAllocation.getHostPtr());
        for (uint32_t i = 0; i < numInputValues; ++i)
            *bufferPtr++ = rnd.getUint32();

        flushAlloc(vk, device, inputBufferAllocation);
    }

    // Create a buffer to store shader output (copied from image data)

    const uint32_t imageArea                 = multiplyComponents(m_imageSize);
    const VkDeviceSize outputBufferSizeBytes = sizeof(uint32_t) * imageArea;
    const BufferWithMemory outputBuffer(vk, device, allocator,
                                        makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT),
                                        MemoryRequirement::HostVisible);

    // Create descriptor set (binding 0 = input SSBO, binding 1 = storage image)

    const Unique<VkDescriptorSetLayout> descriptorSetLayout(
        DescriptorSetLayoutBuilder()
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
            .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
            .build(vk, device));

    const Unique<VkDescriptorPool> descriptorPool(
        DescriptorPoolBuilder()
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
            .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
            .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));

    const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));

    // Set the bindings

    const VkDescriptorImageInfo imageDescriptorInfo =
        makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
    const VkDescriptorBufferInfo bufferDescriptorInfo =
        makeDescriptorBufferInfo(*inputBuffer, 0ull, inputBufferSizeBytes);

    DescriptorSetUpdateBuilder()
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
                     VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
        .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
                     VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
        .update(vk, device);

    // Perform the computation
    {
        ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
                                        m_context.getBinaryCollection().get("comp"));
        pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
        pipeline.buildPipeline();

        // Host write of input data -> shader reads
        const VkBufferMemoryBarrier inputBufferPostHostWriteBarrier = makeBufferMemoryBarrier(
            VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *inputBuffer, 0ull, inputBufferSizeBytes);

        // Transition image UNDEFINED -> GENERAL before shader writes
        const VkImageMemoryBarrier imageLayoutBarrier =
            makeImageMemoryBarrier((VkAccessFlags)0, VK_ACCESS_SHADER_WRITE_BIT, VK_IMAGE_LAYOUT_UNDEFINED,
                                   VK_IMAGE_LAYOUT_GENERAL, *image, subresourceRange);

        // Prepare the command buffer

        const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
        const Unique<VkCommandBuffer> cmdBuffer(
            allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));

        // Start recording commands

        beginCommandBuffer(vk, *cmdBuffer);

        pipeline.bind(*cmdBuffer);
        vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
                                 &descriptorSet.get(), 0u, DE_NULL);

        vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                              (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1,
                              &inputBufferPostHostWriteBarrier, 1, &imageLayoutBarrier);
        // One work group per texel
        vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);

        // Helper records the shader->transfer barrier and the copy itself
        copyImageToBuffer(vk, *cmdBuffer, *image, *outputBuffer, m_imageSize, VK_ACCESS_SHADER_WRITE_BIT,
                          VK_IMAGE_LAYOUT_GENERAL);

        endCommandBuffer(vk, *cmdBuffer);

        // Wait for completion

        submitCommandsAndWait(vk, device, queue, *cmdBuffer);
    }

    // Validate the results

    const Allocation &outputBufferAllocation = outputBuffer.getAllocation();
    invalidateAlloc(vk, device, outputBufferAllocation);

    const uint32_t *bufferPtr    = static_cast<uint32_t *>(outputBufferAllocation.getHostPtr());
    const uint32_t *refBufferPtr = static_cast<uint32_t *>(inputBuffer.getAllocation().getHostPtr());

    for (uint32_t pixelNdx = 0; pixelNdx < imageArea; ++pixelNdx)
    {
        const uint32_t res = bufferPtr[pixelNdx];
        uint32_t ref       = 0;

        // Reference: plain uint32 sum of the work group's inputs (wrap-around
        // matches the shader's uint arithmetic)
        for (uint32_t offs = 0; offs < m_localSize; ++offs)
            ref += refBufferPtr[pixelNdx * m_localSize + offs];

        if (res != ref)
        {
            std::ostringstream msg;
            msg << "Comparison failed for pixel " << pixelNdx;
            return tcu::TestStatus::fail(msg.str());
        }
    }
    return tcu::TestStatus::pass("Compute succeeded");
}
2315 
// Test case: two compute dispatches communicate through a storage image,
// with an image memory barrier between the producing and consuming shader.
class ImageBarrierTest : public vkt::TestCase
{
public:
    ImageBarrierTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec2 &imageSize,
                     const vk::ComputePipelineConstructionType computePipelineConstructionType);

    // Fails early if the requested pipeline construction mode is unsupported.
    virtual void checkSupport(Context &context) const;
    // Builds the GLSL compute shaders ("comp0" writer, "comp1" reader).
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const tcu::IVec2 m_imageSize; // image extent; also the dispatch size in work groups
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
2330 
// Instance for ImageBarrierTest: records the two dispatches separated by an
// image barrier and validates the atomic sum on the host.
class ImageBarrierTestInstance : public vkt::TestInstance
{
public:
    ImageBarrierTestInstance(Context &context, const tcu::IVec2 &imageSize,
                             const vk::ComputePipelineConstructionType computePipelineConstructionType);

    // Runs the test once; returns pass/fail status.
    tcu::TestStatus iterate(void);

private:
    const tcu::IVec2 m_imageSize; // shared image extent / work-group grid
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
2343 
// ImageBarrierTest constructor: stores the image size and the pipeline
// construction mode used when the instance is created.
ImageBarrierTest::ImageBarrierTest(tcu::TestContext &testCtx, const std::string &name, const tcu::IVec2 &imageSize,
                                   const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestCase(testCtx, name)
    , m_imageSize(imageSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
2351 
checkSupport(Context & context) const2352 void ImageBarrierTest::checkSupport(Context &context) const
2353 {
2354     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
2355                                   m_computePipelineConstructionType);
2356 }
2357 
// Register the two GLSL compute shaders:
//  - "comp0": each work group stores (offset + u_baseVal) into its own texel
//    of the r32ui image (binding 1).
//  - "comp1": each work group loads its texel back and atomically adds it
//    into the output sum SSBO (binding 0). Correctness depends on the image
//    barrier recorded between the two dispatches in iterate().
void ImageBarrierTest::initPrograms(SourceCollections &sourceCollections) const
{
    sourceCollections.glslSources.add("comp0")
        << glu::ComputeSource("#version 310 es\n"
                              "layout (local_size_x = 1) in;\n"
                              "layout(binding = 2) readonly uniform Constants {\n"
                              "    uint u_baseVal;\n"
                              "};\n"
                              "layout(binding = 1, r32ui) writeonly uniform highp uimage2D u_img;\n"
                              "void main (void) {\n"
                              "    uint offset = gl_NumWorkGroups.x*gl_NumWorkGroups.y*gl_WorkGroupID.z + "
                              "gl_NumWorkGroups.x*gl_WorkGroupID.y + gl_WorkGroupID.x;\n"
                              "    imageStore(u_img, ivec2(gl_WorkGroupID.xy), uvec4(offset + u_baseVal, 0, 0, 0));\n"
                              "}\n");

    sourceCollections.glslSources.add("comp1")
        << glu::ComputeSource("#version 310 es\n"
                              "layout (local_size_x = 1) in;\n"
                              "layout(binding = 1, r32ui) readonly uniform highp uimage2D u_img;\n"
                              "layout(binding = 0) coherent buffer Output {\n"
                              "    uint sum;\n"
                              "};\n"
                              "void main (void) {\n"
                              "    uint value = imageLoad(u_img, ivec2(gl_WorkGroupID.xy)).x;\n"
                              "    atomicAdd(sum, value);\n"
                              "}\n");
}
2385 
createInstance(Context & context) const2386 TestInstance *ImageBarrierTest::createInstance(Context &context) const
2387 {
2388     return new ImageBarrierTestInstance(context, m_imageSize, m_computePipelineConstructionType);
2389 }
2390 
// Instance constructor: captures the parameters chosen by the test case.
ImageBarrierTestInstance::ImageBarrierTestInstance(
    Context &context, const tcu::IVec2 &imageSize,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : TestInstance(context)
    , m_imageSize(imageSize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
2399 
iterate(void)2400 tcu::TestStatus ImageBarrierTestInstance::iterate(void)
2401 {
2402     const DeviceInterface &vk       = m_context.getDeviceInterface();
2403     const VkDevice device           = m_context.getDevice();
2404     const VkQueue queue             = m_context.getUniversalQueue();
2405     const uint32_t queueFamilyIndex = m_context.getUniversalQueueFamilyIndex();
2406     Allocator &allocator            = m_context.getDefaultAllocator();
2407 
2408     // Create an image used by both shaders
2409 
2410     const VkImageCreateInfo imageParams = make2DImageCreateInfo(m_imageSize, VK_IMAGE_USAGE_STORAGE_BIT);
2411     const ImageWithMemory image(vk, device, allocator, imageParams, MemoryRequirement::Any);
2412 
2413     const VkImageSubresourceRange subresourceRange =
2414         makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
2415     const Unique<VkImageView> imageView(
2416         makeImageView(vk, device, *image, VK_IMAGE_VIEW_TYPE_2D, VK_FORMAT_R32_UINT, subresourceRange));
2417 
2418     // Create an output buffer
2419 
2420     const VkDeviceSize outputBufferSizeBytes = sizeof(uint32_t);
2421     const BufferWithMemory outputBuffer(vk, device, allocator,
2422                                         makeBufferCreateInfo(outputBufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
2423                                         MemoryRequirement::HostVisible);
2424 
2425     // Initialize atomic counter value to zero
2426     {
2427         const Allocation &outputBufferAllocation = outputBuffer.getAllocation();
2428         uint32_t *outputBufferPtr                = static_cast<uint32_t *>(outputBufferAllocation.getHostPtr());
2429         *outputBufferPtr                         = 0;
2430         flushAlloc(vk, device, outputBufferAllocation);
2431     }
2432 
2433     // Create a uniform buffer (to pass uniform constants)
2434 
2435     const VkDeviceSize uniformBufferSizeBytes = sizeof(uint32_t);
2436     const BufferWithMemory uniformBuffer(
2437         vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT),
2438         MemoryRequirement::HostVisible);
2439 
2440     // Set the constants in the uniform buffer
2441 
2442     const uint32_t baseValue = 127;
2443     {
2444         const Allocation &uniformBufferAllocation = uniformBuffer.getAllocation();
2445         uint32_t *uniformBufferPtr                = static_cast<uint32_t *>(uniformBufferAllocation.getHostPtr());
2446         uniformBufferPtr[0]                       = baseValue;
2447 
2448         flushAlloc(vk, device, uniformBufferAllocation);
2449     }
2450 
2451     // Create descriptor set
2452 
2453     const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2454         DescriptorSetLayoutBuilder()
2455             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2456             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_SHADER_STAGE_COMPUTE_BIT)
2457             .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2458             .build(vk, device));
2459 
2460     const Unique<VkDescriptorPool> descriptorPool(
2461         DescriptorPoolBuilder()
2462             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2463             .addType(VK_DESCRIPTOR_TYPE_STORAGE_IMAGE)
2464             .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2465             .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2466 
2467     const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2468 
2469     const VkDescriptorImageInfo imageDescriptorInfo =
2470         makeDescriptorImageInfo(DE_NULL, *imageView, VK_IMAGE_LAYOUT_GENERAL);
2471     const VkDescriptorBufferInfo outputBufferDescriptorInfo =
2472         makeDescriptorBufferInfo(*outputBuffer, 0ull, outputBufferSizeBytes);
2473     const VkDescriptorBufferInfo uniformBufferDescriptorInfo =
2474         makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2475     DescriptorSetUpdateBuilder()
2476         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
2477                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &outputBufferDescriptorInfo)
2478         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
2479                      VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &imageDescriptorInfo)
2480         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(2u),
2481                      VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2482         .update(vk, device);
2483 
2484     // Perform the computation
2485 
2486     ComputePipelineWrapper pipeline0(vk, device, m_computePipelineConstructionType,
2487                                      m_context.getBinaryCollection().get("comp0"));
2488     pipeline0.setDescriptorSetLayout(descriptorSetLayout.get());
2489     pipeline0.buildPipeline();
2490     ComputePipelineWrapper pipeline1(vk, device, m_computePipelineConstructionType,
2491                                      m_context.getBinaryCollection().get("comp1"));
2492     pipeline1.setDescriptorSetLayout(descriptorSetLayout.get());
2493     pipeline1.buildPipeline();
2494 
2495     const VkBufferMemoryBarrier writeUniformConstantsBarrier = makeBufferMemoryBarrier(
2496         VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2497 
2498     const VkImageMemoryBarrier imageLayoutBarrier =
2499         makeImageMemoryBarrier(0u, 0u, VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_GENERAL, *image, subresourceRange);
2500 
2501     const VkImageMemoryBarrier imageBarrierBetweenShaders =
2502         makeImageMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, VK_IMAGE_LAYOUT_GENERAL,
2503                                VK_IMAGE_LAYOUT_GENERAL, *image, subresourceRange);
2504 
2505     const VkBufferMemoryBarrier afterComputeBarrier = makeBufferMemoryBarrier(
2506         VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *outputBuffer, 0ull, outputBufferSizeBytes);
2507 
2508     const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
2509     const Unique<VkCommandBuffer> cmdBuffer(
2510         allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2511 
2512     // Start recording commands
2513 
2514     beginCommandBuffer(vk, *cmdBuffer);
2515 
2516     pipeline0.bind(*cmdBuffer);
2517     vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline0.getPipelineLayout(), 0u, 1u,
2518                              &descriptorSet.get(), 0u, DE_NULL);
2519 
2520     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
2521                           (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &writeUniformConstantsBarrier,
2522                           1, &imageLayoutBarrier);
2523 
2524     vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2525     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
2526                           (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 0,
2527                           (const VkBufferMemoryBarrier *)DE_NULL, 1, &imageBarrierBetweenShaders);
2528 
2529     // Switch to the second shader program
2530     pipeline1.bind(*cmdBuffer);
2531 
2532     vk.cmdDispatch(*cmdBuffer, m_imageSize.x(), m_imageSize.y(), 1u);
2533     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
2534                           (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &afterComputeBarrier, 0,
2535                           (const VkImageMemoryBarrier *)DE_NULL);
2536 
2537     endCommandBuffer(vk, *cmdBuffer);
2538 
2539     // Wait for completion
2540 
2541     submitCommandsAndWait(vk, device, queue, *cmdBuffer);
2542 
2543     // Validate the results
2544 
2545     const Allocation &outputBufferAllocation = outputBuffer.getAllocation();
2546     invalidateAlloc(vk, device, outputBufferAllocation);
2547 
2548     const int numValues       = multiplyComponents(m_imageSize);
2549     const uint32_t *bufferPtr = static_cast<uint32_t *>(outputBufferAllocation.getHostPtr());
2550     const uint32_t res        = *bufferPtr;
2551     uint32_t ref              = 0;
2552 
2553     for (int ndx = 0; ndx < numValues; ++ndx)
2554         ref += baseValue + ndx;
2555 
2556     if (res != ref)
2557     {
2558         std::ostringstream msg;
2559         msg << "ERROR: comparison failed, expected " << ref << ", got " << res;
2560         return tcu::TestStatus::fail(msg.str());
2561     }
2562     return tcu::TestStatus::pass("Compute succeeded");
2563 }
2564 
// Shared base for compute tests that run on a custom logical device created
// over a device group (VK_KHR_device_group). Constructing an instance
// immediately builds the custom instance/device pair via createDeviceGroup().
class ComputeTestInstance : public vkt::TestInstance
{
public:
    ComputeTestInstance(Context &context, vk::ComputePipelineConstructionType computePipelineConstructionType,
                        bool useMaintenance5)
        : TestInstance(context)
        , m_numPhysDevices(1)
        , m_queueFamilyIndex(0)
        , m_computePipelineConstructionType(computePipelineConstructionType)
        , m_maintenance5(useMaintenance5)
    {
        createDeviceGroup();
    }

    ~ComputeTestInstance()
    {
    }

    // Builds m_deviceGroupInstance, m_logicalDevice and m_deviceDriver, fills
    // m_physicalDevices and selects a compute-capable queue family.
    void createDeviceGroup(void);
    // Interface for the custom device (NOT the context's default device).
    const vk::DeviceInterface &getDeviceInterface(void)
    {
        return *m_deviceDriver;
    }
    vk::VkInstance getInstance(void)
    {
        return m_deviceGroupInstance;
    }
    vk::VkDevice getDevice(void)
    {
        return *m_logicalDevice;
    }
    // i indexes into the selected device group (0 = first physical device).
    vk::VkPhysicalDevice getPhysicalDevice(uint32_t i = 0)
    {
        return m_physicalDevices[i];
    }

protected:
    uint32_t m_numPhysDevices;   // number of physical devices in the selected group
    uint32_t m_queueFamilyIndex; // compute-capable queue family on the custom device
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
    bool m_maintenance5; // request VK_KHR_maintenance5 at device creation

private:
    CustomInstance m_deviceGroupInstance;
    vk::Move<vk::VkDevice> m_logicalDevice;
    std::vector<vk::VkPhysicalDevice> m_physicalDevices;
#ifndef CTS_USES_VULKANSC
    de::MovePtr<vk::DeviceDriver> m_deviceDriver;
#else
    de::MovePtr<vk::DeviceDriverSC, vk::DeinitDeviceDeleter> m_deviceDriver;
#endif // CTS_USES_VULKANSC
};
2617 
// Creates a custom instance with VK_KHR_device_group_creation enabled, selects
// the device group / physical device given on the command line, and builds a
// logical device spanning every physical device in that group. Also picks a
// compute-capable queue family and instantiates the matching device driver.
void ComputeTestInstance::createDeviceGroup(void)
{
    const tcu::CommandLine &cmdLine = m_context.getTestContext().getCommandLine();
    // Command-line ids are 1-based; convert to 0-based indices.
    const uint32_t devGroupIdx      = cmdLine.getVKDeviceGroupId() - 1;
    const uint32_t physDeviceIdx    = cmdLine.getVKDeviceId() - 1;
    const float queuePriority       = 1.0f;
    const std::vector<std::string> requiredExtensions(1, "VK_KHR_device_group_creation");
    m_deviceGroupInstance = createCustomInstanceWithExtensions(m_context, requiredExtensions);
    std::vector<VkPhysicalDeviceGroupProperties> devGroupProperties =
        enumeratePhysicalDeviceGroups(m_context.getInstanceInterface(), m_deviceGroupInstance);
    m_numPhysDevices = devGroupProperties[devGroupIdx].physicalDeviceCount;
    std::vector<const char *> deviceExtensions;

    // VK_KHR_device_group is core from Vulkan 1.1; only request it explicitly otherwise.
    if (!isCoreDeviceExtension(m_context.getUsedApiVersion(), "VK_KHR_device_group"))
        deviceExtensions.push_back("VK_KHR_device_group");

    if (m_maintenance5)
        deviceExtensions.push_back("VK_KHR_maintenance5");

    VkDeviceGroupDeviceCreateInfo deviceGroupInfo = {
        VK_STRUCTURE_TYPE_DEVICE_GROUP_DEVICE_CREATE_INFO,   //stype
        DE_NULL,                                             //pNext
        devGroupProperties[devGroupIdx].physicalDeviceCount, //physicalDeviceCount
        devGroupProperties[devGroupIdx].physicalDevices      //physicalDevices
    };
    const InstanceDriver &instance(m_deviceGroupInstance.getDriver());
    VkPhysicalDeviceFeatures2 deviceFeatures2 = initVulkanStructure();
    const VkPhysicalDeviceFeatures deviceFeatures =
        getPhysicalDeviceFeatures(instance, deviceGroupInfo.pPhysicalDevices[physDeviceIdx]);
    const std::vector<VkQueueFamilyProperties> queueProps = getPhysicalDeviceQueueFamilyProperties(
        instance, devGroupProperties[devGroupIdx].physicalDevices[physDeviceIdx]);

    deviceFeatures2.features = deviceFeatures;

#ifndef CTS_USES_VULKANSC
    // Shader-object based construction additionally needs VK_EXT_shader_object
    // (whose feature struct depends on dynamic rendering). A non-zero
    // construction type means shader objects are used instead of pipelines.
    VkPhysicalDeviceDynamicRenderingFeaturesKHR dynamicRenderingFeatures = initVulkanStructure();
    dynamicRenderingFeatures.dynamicRendering                            = VK_TRUE;
    VkPhysicalDeviceShaderObjectFeaturesEXT shaderObjectFeatures = initVulkanStructure(&dynamicRenderingFeatures);
    shaderObjectFeatures.shaderObject                            = VK_TRUE;
    if (m_computePipelineConstructionType)
    {
        deviceExtensions.push_back("VK_EXT_shader_object");
        deviceFeatures2.pNext = &shaderObjectFeatures;
    }
#endif

    m_physicalDevices.resize(m_numPhysDevices);
    for (uint32_t physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
        m_physicalDevices[physDevIdx] = devGroupProperties[devGroupIdx].physicalDevices[physDevIdx];

    // Keep the last compute-capable queue family encountered.
    for (size_t queueNdx = 0; queueNdx < queueProps.size(); queueNdx++)
    {
        if (queueProps[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
            m_queueFamilyIndex = (uint32_t)queueNdx;
    }

    VkDeviceQueueCreateInfo queueInfo = {
        VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, // VkStructureType sType;
        DE_NULL,                                    // const void* pNext;
        (VkDeviceQueueCreateFlags)0u,               // VkDeviceQueueCreateFlags flags;
        m_queueFamilyIndex,                         // uint32_t queueFamilyIndex;
        1u,                                         // uint32_t queueCount;
        &queuePriority                              // const float* pQueuePriorities;
    };

    // Chain: deviceInfo -> deviceGroupInfo [-> deviceFeatures2 -> shaderObjectFeatures].
    // When deviceFeatures2 is chained, pEnabledFeatures (below) must be NULL and
    // the enabled features travel through the features2 struct instead.
    void *pNext = &deviceGroupInfo;
    if (deviceFeatures2.pNext != DE_NULL)
        deviceGroupInfo.pNext = &deviceFeatures2;

#ifdef CTS_USES_VULKANSC
    // Vulkan SC: prepend object reservation / SC 1.0 feature structs to the chain.
    VkDeviceObjectReservationCreateInfo memReservationInfo = cmdLine.isSubProcess() ?
                                                                 m_context.getResourceInterface()->getStatMax() :
                                                                 resetDeviceObjectReservationCreateInfo();
    memReservationInfo.pNext                               = pNext;
    pNext                                                  = &memReservationInfo;

    VkPhysicalDeviceVulkanSC10Features sc10Features = createDefaultSC10Features();
    sc10Features.pNext                              = pNext;
    pNext                                           = &sc10Features;
    VkPipelineCacheCreateInfo pcCI;
    std::vector<VkPipelinePoolSize> poolSizes;
    if (cmdLine.isSubProcess())
    {
        if (m_context.getResourceInterface()->getCacheDataSize() > 0)
        {
            pcCI = {
                VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, // VkStructureType sType;
                DE_NULL,                                      // const void* pNext;
                VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
                    VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT, // VkPipelineCacheCreateFlags flags;
                m_context.getResourceInterface()->getCacheDataSize(),     // uintptr_t initialDataSize;
                m_context.getResourceInterface()->getCacheData()          // const void* pInitialData;
            };
            memReservationInfo.pipelineCacheCreateInfoCount = 1;
            memReservationInfo.pPipelineCacheCreateInfos    = &pcCI;
        }

        poolSizes = m_context.getResourceInterface()->getPipelinePoolSizes();
        if (!poolSizes.empty())
        {
            memReservationInfo.pipelinePoolSizeCount = uint32_t(poolSizes.size());
            memReservationInfo.pPipelinePoolSizes    = poolSizes.data();
        }
    }

#endif // CTS_USES_VULKANSC

    const VkDeviceCreateInfo deviceInfo = {
        VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, // VkStructureType sType;
        pNext,                                // const void* pNext;
        (VkDeviceCreateFlags)0,               // VkDeviceCreateFlags flags;
        1u,                                   // uint32_t queueCreateInfoCount;
        &queueInfo,                           // const VkDeviceQueueCreateInfo* pQueueCreateInfos;
        0u,                                   // uint32_t enabledLayerCount;
        DE_NULL,                              // const char* const* ppEnabledLayerNames;
        uint32_t(deviceExtensions.size()),    // uint32_t enabledExtensionCount;
        (deviceExtensions.empty() ? DE_NULL : &deviceExtensions[0]), // const char* const* ppEnabledExtensionNames;
        deviceFeatures2.pNext == DE_NULL ? &deviceFeatures :
                                           DE_NULL, // const VkPhysicalDeviceFeatures* pEnabledFeatures;
    };

    m_logicalDevice = createCustomDevice(m_context.getTestContext().getCommandLine().isValidationEnabled(),
                                         m_context.getPlatformInterface(), m_deviceGroupInstance, instance,
                                         deviceGroupInfo.pPhysicalDevices[physDeviceIdx], &deviceInfo);
#ifndef CTS_USES_VULKANSC
    m_deviceDriver = de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), m_deviceGroupInstance,
                                                                *m_logicalDevice, m_context.getUsedApiVersion(),
                                                                m_context.getTestContext().getCommandLine()));
#else
    m_deviceDriver = de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(
        new DeviceDriverSC(m_context.getPlatformInterface(), m_context.getInstance(), *m_logicalDevice,
                           m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(),
                           m_context.getDeviceVulkanSC10Properties(), m_context.getDeviceProperties(),
                           m_context.getUsedApiVersion()),
        vk::DeinitDeviceDeleter(m_context.getResourceInterface().get(), *m_logicalDevice));
#endif // CTS_USES_VULKANSC
}
2757 
// Test case for vkCmdDispatchBase: a work grid (worksize) is covered by
// multiple dispatches with non-zero base workgroup offsets — split across the
// device group in X (per splitsize) and into local-size-shaped chunks in Y/Z.
class DispatchBaseTest : public vkt::TestCase
{
public:
    DispatchBaseTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues,
                     const tcu::IVec3 &localsize, const tcu::IVec3 &worksize, const tcu::IVec3 &splitsize,
                     const vk::ComputePipelineConstructionType computePipelineConstructionType,
                     const bool useMaintenance5);

    virtual void checkSupport(Context &context) const;
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const uint32_t m_numValues;   // number of uint32 elements in the storage buffer
    const tcu::IVec3 m_localSize; // shader workgroup local size
    const tcu::IVec3 m_workSize;  // total dispatch grid, in workgroups
    const tcu::IVec3 m_splitSize; // per-physical-device X-slice of the grid
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
    const bool m_useMaintenance5; // express dispatch-base via VkPipelineCreateFlags2CreateInfoKHR
};
2778 
// Instance side of DispatchBaseTest: validates the size triple in the
// constructor and records/executes the split dispatches in iterate().
class DispatchBaseTestInstance : public ComputeTestInstance
{
public:
    DispatchBaseTestInstance(Context &context, const uint32_t numValues, const tcu::IVec3 &localsize,
                             const tcu::IVec3 &worksize, const tcu::IVec3 &splitsize,
                             const vk::ComputePipelineConstructionType computePipelineConstructionType,
                             const bool useMaintenance5);

    // True when every component of "big" is >= and an exact multiple of the
    // matching component of "small".
    bool isInputVectorValid(const tcu::IVec3 &small, const tcu::IVec3 &big);
    tcu::TestStatus iterate(void);

private:
    const uint32_t m_numValues;
    const tcu::IVec3 m_localSize;
    const tcu::IVec3 m_workSize;
    const tcu::IVec3 m_splitWorkSize;
    // NOTE(review): duplicates ComputeTestInstance::m_maintenance5, which the
    // base constructor also receives; kept here for direct use in iterate().
    const bool m_useMaintenance5;
};
2797 
// Stores the test parameters; validation of the size combination is deferred
// to the instance constructor, where failures are reported per test run.
DispatchBaseTest::DispatchBaseTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues,
                                   const tcu::IVec3 &localsize, const tcu::IVec3 &worksize, const tcu::IVec3 &splitsize,
                                   const vk::ComputePipelineConstructionType computePipelineConstructionType,
                                   const bool useMaintenance5)
    : TestCase(testCtx, name)
    , m_numValues(numValues)
    , m_localSize(localsize)
    , m_workSize(worksize)
    , m_splitSize(splitsize)
    , m_computePipelineConstructionType(computePipelineConstructionType)
    , m_useMaintenance5(useMaintenance5)
{
}
2811 
// Rejects the test early on implementations lacking the requested pipeline /
// shader-object construction path or (when exercised) VK_KHR_maintenance5.
void DispatchBaseTest::checkSupport(Context &context) const
{
    checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
                                  m_computePipelineConstructionType);
    if (m_useMaintenance5)
        context.requireDeviceFunctionality("VK_KHR_maintenance5");
}
2819 
initPrograms(SourceCollections & sourceCollections) const2820 void DispatchBaseTest::initPrograms(SourceCollections &sourceCollections) const
2821 {
2822     std::ostringstream src;
2823     src << "#version 310 es\n"
2824         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
2825         << ", local_size_z = " << m_localSize.z() << ") in;\n"
2826 
2827         << "layout(binding = 0) buffer InOut {\n"
2828         << "    uint values[" << de::toString(m_numValues) << "];\n"
2829         << "} sb_inout;\n"
2830 
2831         << "layout(binding = 1) readonly uniform uniformInput {\n"
2832         << "    uvec3 gridSize;\n"
2833         << "} ubo_in;\n"
2834 
2835         << "void main (void) {\n"
2836         << "    uvec3 size = ubo_in.gridSize * gl_WorkGroupSize;\n"
2837         << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
2838         << "    uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
2839            "gl_GlobalInvocationID.x;\n"
2840         << "    uint offset = numValuesPerInv*index;\n"
2841         << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
2842         << "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
2843         << "}\n";
2844 
2845     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
2846 }
2847 
createInstance(Context & context) const2848 TestInstance *DispatchBaseTest::createInstance(Context &context) const
2849 {
2850     return new DispatchBaseTestInstance(context, m_numValues, m_localSize, m_workSize, m_splitSize,
2851                                         m_computePipelineConstructionType, m_useMaintenance5);
2852 }
2853 
// Validates that localsize/splitsize/worksize form a configuration the split
// logic in iterate() can cover exactly; throws TestError otherwise.
DispatchBaseTestInstance::DispatchBaseTestInstance(
    Context &context, const uint32_t numValues, const tcu::IVec3 &localsize, const tcu::IVec3 &worksize,
    const tcu::IVec3 &splitsize, const vk::ComputePipelineConstructionType computePipelineConstructionType,
    const bool useMaintenance5)

    : ComputeTestInstance(context, computePipelineConstructionType, useMaintenance5)
    , m_numValues(numValues)
    , m_localSize(localsize)
    , m_workSize(worksize)
    , m_splitWorkSize(splitsize)
    , m_useMaintenance5(useMaintenance5)
{
    // For easy work distribution across physical devices:
    // WorkSize should be a multiple of SplitWorkSize only in the X component
    if ((!isInputVectorValid(m_splitWorkSize, m_workSize)) || (m_workSize.x() <= m_splitWorkSize.x()) ||
        (m_workSize.y() != m_splitWorkSize.y()) || (m_workSize.z() != m_splitWorkSize.z()))
        TCU_THROW(TestError, "Invalid Input.");

    // For easy work distribution within the same physical device:
    // SplitWorkSize should be a multiple of localSize in Y or Z component
    if ((!isInputVectorValid(m_localSize, m_splitWorkSize)) || (m_localSize.x() != m_splitWorkSize.x()) ||
        (m_localSize.y() >= m_splitWorkSize.y()) || (m_localSize.z() >= m_splitWorkSize.z()))
        TCU_THROW(TestError, "Invalid Input.");

    // Each physical device in the group must receive at least one split-sized chunk.
    if ((multiplyComponents(m_workSize) / multiplyComponents(m_splitWorkSize)) < (int32_t)m_numPhysDevices)
        TCU_THROW(TestError, "Not enough work to distribute across all physical devices.");

    // The buffer must hold a whole (non-zero) number of values per invocation.
    uint32_t totalWork = multiplyComponents(m_workSize) * multiplyComponents(m_localSize);
    if ((totalWork > numValues) || (numValues % totalWork != 0))
        TCU_THROW(TestError, "Buffer too small/not aligned to cover all values.");
}
2885 
isInputVectorValid(const tcu::IVec3 & small,const tcu::IVec3 & big)2886 bool DispatchBaseTestInstance::isInputVectorValid(const tcu::IVec3 &small, const tcu::IVec3 &big)
2887 {
2888     if (((big.x() < small.x()) || (big.y() < small.y()) || (big.z() < small.z())) ||
2889         ((big.x() % small.x() != 0) || (big.y() % small.y() != 0) || (big.z() % small.z() != 0)))
2890         return false;
2891     return true;
2892 }
2893 
iterate(void)2894 tcu::TestStatus DispatchBaseTestInstance::iterate(void)
2895 {
2896     const DeviceInterface &vk = getDeviceInterface();
2897     const VkDevice device     = getDevice();
2898     const VkQueue queue       = getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
2899     SimpleAllocator allocator(vk, device,
2900                               getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
2901     uint32_t totalWorkloadSize = 0;
2902 
2903     // Create an uniform and input/output buffer
2904     const uint32_t uniformBufSize             = 3; // Pass the compute grid size
2905     const VkDeviceSize uniformBufferSizeBytes = sizeof(uint32_t) * uniformBufSize;
2906     const BufferWithMemory uniformBuffer(
2907         vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT),
2908         MemoryRequirement::HostVisible);
2909 
2910     const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * m_numValues;
2911     const BufferWithMemory buffer(vk, device, allocator,
2912                                   makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
2913                                   MemoryRequirement::HostVisible);
2914 
2915     // Fill the buffers with data
2916     typedef std::vector<uint32_t> data_vector_t;
2917     data_vector_t uniformInputData(uniformBufSize);
2918     data_vector_t inputData(m_numValues);
2919 
2920     {
2921         const Allocation &bufferAllocation = uniformBuffer.getAllocation();
2922         uint32_t *bufferPtr                = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
2923         uniformInputData[0] = *bufferPtr++ = m_workSize.x();
2924         uniformInputData[1] = *bufferPtr++ = m_workSize.y();
2925         uniformInputData[2] = *bufferPtr++ = m_workSize.z();
2926         flushAlloc(vk, device, bufferAllocation);
2927     }
2928 
2929     {
2930         de::Random rnd(0x82ce7f);
2931         const Allocation &bufferAllocation = buffer.getAllocation();
2932         uint32_t *bufferPtr                = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
2933         for (uint32_t i = 0; i < m_numValues; ++i)
2934             inputData[i] = *bufferPtr++ = rnd.getUint32();
2935 
2936         flushAlloc(vk, device, bufferAllocation);
2937     }
2938 
2939     // Create descriptor set
2940     const Unique<VkDescriptorSetLayout> descriptorSetLayout(
2941         DescriptorSetLayoutBuilder()
2942             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2943             .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
2944             .build(vk, device));
2945 
2946     const Unique<VkDescriptorPool> descriptorPool(
2947         DescriptorPoolBuilder()
2948             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
2949             .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
2950             .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
2951 
2952     const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
2953 
2954     const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*buffer, 0ull, bufferSizeBytes);
2955     const VkDescriptorBufferInfo uniformBufferDescriptorInfo =
2956         makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
2957 
2958     DescriptorSetUpdateBuilder()
2959         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
2960                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
2961         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
2962                      VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
2963         .update(vk, device);
2964 
2965     ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
2966                                     m_context.getBinaryCollection().get("comp"));
2967     pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
2968     pipeline.setPipelineCreateFlags(VK_PIPELINE_CREATE_DISPATCH_BASE);
2969 
2970 #ifndef CTS_USES_VULKANSC
2971     if (m_useMaintenance5)
2972     {
2973         VkPipelineCreateFlags2CreateInfoKHR pipelineFlags2CreateInfo = initVulkanStructure();
2974         pipelineFlags2CreateInfo.flags                               = VK_PIPELINE_CREATE_2_DISPATCH_BASE_BIT_KHR;
2975         pipeline.setPipelineCreatePNext(&pipelineFlags2CreateInfo);
2976         pipeline.setPipelineCreateFlags(0);
2977     }
2978 #else
2979     DE_UNREF(m_useMaintenance5);
2980 #endif // CTS_USES_VULKANSC
2981 
2982     pipeline.buildPipeline();
2983 
2984     const VkBufferMemoryBarrier hostWriteBarrier =
2985         makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer, 0ull, bufferSizeBytes);
2986     const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(
2987         VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
2988 
2989     const VkBufferMemoryBarrier shaderWriteBarrier =
2990         makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer, 0ull, bufferSizeBytes);
2991 
2992     const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
2993     const Unique<VkCommandBuffer> cmdBuffer(
2994         allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
2995 
2996     // Start recording commands
2997     beginCommandBuffer(vk, *cmdBuffer);
2998 
2999     pipeline.bind(*cmdBuffer);
3000     vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
3001                              &descriptorSet.get(), 0u, DE_NULL);
3002 
3003     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
3004                           (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &hostUniformWriteBarrier, 0,
3005                           (const VkImageMemoryBarrier *)DE_NULL);
3006 
3007     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
3008                           (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &hostWriteBarrier, 0,
3009                           (const VkImageMemoryBarrier *)DE_NULL);
3010 
3011     // Split the workload across all physical devices based on m_splitWorkSize.x()
3012     for (uint32_t physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
3013     {
3014         uint32_t baseGroupX = physDevIdx * m_splitWorkSize.x();
3015         uint32_t baseGroupY = 0;
3016         uint32_t baseGroupZ = 0;
3017 
3018         // Split the workload within the physical device based on m_localSize.y() and m_localSize.z()
3019         for (int32_t localIdxY = 0; localIdxY < (m_splitWorkSize.y() / m_localSize.y()); localIdxY++)
3020         {
3021             for (int32_t localIdxZ = 0; localIdxZ < (m_splitWorkSize.z() / m_localSize.z()); localIdxZ++)
3022             {
3023                 uint32_t offsetX = baseGroupX;
3024                 uint32_t offsetY = baseGroupY + localIdxY * m_localSize.y();
3025                 uint32_t offsetZ = baseGroupZ + localIdxZ * m_localSize.z();
3026 
3027                 uint32_t localSizeX =
3028                     (physDevIdx == (m_numPhysDevices - 1)) ? m_workSize.x() - baseGroupX : m_localSize.x();
3029                 uint32_t localSizeY = m_localSize.y();
3030                 uint32_t localSizeZ = m_localSize.z();
3031 
3032                 totalWorkloadSize += (localSizeX * localSizeY * localSizeZ);
3033                 vk.cmdDispatchBase(*cmdBuffer, offsetX, offsetY, offsetZ, localSizeX, localSizeY, localSizeZ);
3034             }
3035         }
3036     }
3037 
3038     vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
3039                           (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &shaderWriteBarrier, 0,
3040                           (const VkImageMemoryBarrier *)DE_NULL);
3041 
3042     endCommandBuffer(vk, *cmdBuffer);
3043     submitCommandsAndWait(vk, device, queue, *cmdBuffer);
3044 
3045     if (totalWorkloadSize != uint32_t(multiplyComponents(m_workSize)))
3046         TCU_THROW(TestError, "Not covering the entire workload.");
3047 
3048     // Validate the results
3049     const Allocation &bufferAllocation = buffer.getAllocation();
3050     invalidateAlloc(vk, device, bufferAllocation);
3051     const uint32_t *bufferPtr = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
3052 
3053     for (uint32_t ndx = 0; ndx < m_numValues; ++ndx)
3054     {
3055         const uint32_t res = bufferPtr[ndx];
3056         const uint32_t ref = ~inputData[ndx];
3057 
3058         if (res != ref)
3059         {
3060             std::ostringstream msg;
3061             msg << "Comparison failed for InOut.values[" << ndx << "]";
3062             return tcu::TestStatus::fail(msg.str());
3063         }
3064     }
3065     return tcu::TestStatus::pass("Compute succeeded");
3066 }
3067 
// Test case: dispatches a compute shader across the physical devices of a device
// group and uses gl_DeviceIndex in the shader to verify which device executed
// each dispatch.
class DeviceIndexTest : public vkt::TestCase
{
public:
    // NOTE(review): this declaration names the fifth parameter "splitsize", but the
    // definition visible below calls it "worksize" and stores it into m_workSize;
    // m_splitSize is never initialized in that definition -- confirm intent against
    // the rest of the file.
    DeviceIndexTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues,
                    const tcu::IVec3 &localsize, const tcu::IVec3 &splitsize,
                    const vk::ComputePipelineConstructionType computePipelineConstructionType);

    // Throws NotSupported when the requested pipeline construction path
    // (e.g. shader objects) is unavailable on the implementation.
    virtual void checkSupport(Context &context) const;
    // Registers the GLSL compute shader "comp" that reads gl_DeviceIndex.
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

private:
    const uint32_t m_numValues;   // number of uint elements in the storage buffer
    const tcu::IVec3 m_localSize; // shader local workgroup size (x, y, z)
    const tcu::IVec3 m_workSize;  // dispatch size forwarded to the test instance
    const tcu::IVec3 m_splitSize; // NOTE(review): unused in the code visible here
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
3086 
// Instance side of DeviceIndexTest: records and submits the per-device-mask
// compute dispatches and validates the results on each device (see iterate()).
class DeviceIndexTestInstance : public ComputeTestInstance
{
public:
    DeviceIndexTestInstance(Context &context, const uint32_t numValues, const tcu::IVec3 &localsize,
                            const tcu::IVec3 &worksize,
                            const vk::ComputePipelineConstructionType computePipelineConstructionType);
    tcu::TestStatus iterate(void);

private:
    const uint32_t m_numValues;   // number of uint elements in the storage buffer
    const tcu::IVec3 m_localSize; // shader local workgroup size
    tcu::IVec3 m_workSize;        // dispatch size (non-const, unlike the test-case copy)
};
3100 
DeviceIndexTest(tcu::TestContext & testCtx,const std::string & name,const uint32_t numValues,const tcu::IVec3 & localsize,const tcu::IVec3 & worksize,const vk::ComputePipelineConstructionType computePipelineConstructionType)3101 DeviceIndexTest::DeviceIndexTest(tcu::TestContext &testCtx, const std::string &name, const uint32_t numValues,
3102                                  const tcu::IVec3 &localsize, const tcu::IVec3 &worksize,
3103                                  const vk::ComputePipelineConstructionType computePipelineConstructionType)
3104     : TestCase(testCtx, name)
3105     , m_numValues(numValues)
3106     , m_localSize(localsize)
3107     , m_workSize(worksize)
3108     , m_computePipelineConstructionType(computePipelineConstructionType)
3109 {
3110 }
3111 
checkSupport(Context & context) const3112 void DeviceIndexTest::checkSupport(Context &context) const
3113 {
3114     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
3115                                   m_computePipelineConstructionType);
3116 }
3117 
initPrograms(SourceCollections & sourceCollections) const3118 void DeviceIndexTest::initPrograms(SourceCollections &sourceCollections) const
3119 {
3120     std::ostringstream src;
3121     src << "#version 310 es\n"
3122         << "#extension GL_EXT_device_group : require\n"
3123         << "layout (local_size_x = " << m_localSize.x() << ", local_size_y = " << m_localSize.y()
3124         << ", local_size_z = " << m_localSize.z() << ") in;\n"
3125 
3126         << "layout(binding = 0) buffer InOut {\n"
3127         << "    uint values[" << de::toString(m_numValues) << "];\n"
3128         << "} sb_inout;\n"
3129 
3130         << "layout(binding = 1) readonly uniform uniformInput {\n"
3131         << "    uint baseOffset[1+" << VK_MAX_DEVICE_GROUP_SIZE << "];\n"
3132         << "} ubo_in;\n"
3133 
3134         << "void main (void) {\n"
3135         << "    uvec3 size = gl_NumWorkGroups * gl_WorkGroupSize;\n"
3136         << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
3137         << "    uint index = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
3138            "gl_GlobalInvocationID.x;\n"
3139         << "    uint offset = numValuesPerInv*index;\n"
3140         << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
3141         << "        sb_inout.values[offset + ndx] = ubo_in.baseOffset[0] + ubo_in.baseOffset[gl_DeviceIndex + 1];\n"
3142         << "}\n";
3143 
3144     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
3145 }
3146 
createInstance(Context & context) const3147 TestInstance *DeviceIndexTest::createInstance(Context &context) const
3148 {
3149     return new DeviceIndexTestInstance(context, m_numValues, m_localSize, m_workSize,
3150                                        m_computePipelineConstructionType);
3151 }
3152 
DeviceIndexTestInstance(Context & context,const uint32_t numValues,const tcu::IVec3 & localsize,const tcu::IVec3 & worksize,const vk::ComputePipelineConstructionType computePipelineConstructionType)3153 DeviceIndexTestInstance::DeviceIndexTestInstance(
3154     Context &context, const uint32_t numValues, const tcu::IVec3 &localsize, const tcu::IVec3 &worksize,
3155     const vk::ComputePipelineConstructionType computePipelineConstructionType)
3156 
3157     : ComputeTestInstance(context, computePipelineConstructionType, false)
3158     , m_numValues(numValues)
3159     , m_localSize(localsize)
3160     , m_workSize(worksize)
3161 {
3162 }
3163 
iterate(void)3164 tcu::TestStatus DeviceIndexTestInstance::iterate(void)
3165 {
3166     const DeviceInterface &vk = getDeviceInterface();
3167     const VkDevice device     = getDevice();
3168     const VkQueue queue       = getDeviceQueue(vk, device, m_queueFamilyIndex, 0);
3169     SimpleAllocator allocator(vk, device,
3170                               getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice()));
3171     const uint32_t allocDeviceMask = (1 << m_numPhysDevices) - 1;
3172     de::Random rnd(0x82ce7f);
3173     Move<VkBuffer> sboBuffer;
3174     vk::Move<vk::VkDeviceMemory> sboBufferMemory;
3175 
3176     // Create an uniform and output buffer
3177     const uint32_t uniformBufSize             = 4 * (1 + VK_MAX_DEVICE_GROUP_SIZE);
3178     const VkDeviceSize uniformBufferSizeBytes = sizeof(uint32_t) * uniformBufSize;
3179     const BufferWithMemory uniformBuffer(
3180         vk, device, allocator, makeBufferCreateInfo(uniformBufferSizeBytes, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT),
3181         MemoryRequirement::HostVisible);
3182 
3183     const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * m_numValues;
3184     const BufferWithMemory checkBuffer(vk, device, allocator,
3185                                        makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_TRANSFER_DST_BIT),
3186                                        MemoryRequirement::HostVisible);
3187 
3188     // create SBO buffer
3189     {
3190         const VkBufferCreateInfo sboBufferParams = {
3191             VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,                                  // sType
3192             DE_NULL,                                                               // pNext
3193             0u,                                                                    // flags
3194             (VkDeviceSize)bufferSizeBytes,                                         // size
3195             VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT, // usage
3196             VK_SHARING_MODE_EXCLUSIVE,                                             // sharingMode
3197             1u,                                                                    // queueFamilyIndexCount
3198             &m_queueFamilyIndex,                                                   // pQueueFamilyIndices
3199         };
3200         sboBuffer = createBuffer(vk, device, &sboBufferParams);
3201 
3202         VkMemoryRequirements memReqs = getBufferMemoryRequirements(vk, device, sboBuffer.get());
3203         uint32_t memoryTypeNdx       = 0;
3204         const VkPhysicalDeviceMemoryProperties deviceMemProps =
3205             getPhysicalDeviceMemoryProperties(m_context.getInstanceInterface(), getPhysicalDevice());
3206         for (memoryTypeNdx = 0; memoryTypeNdx < deviceMemProps.memoryTypeCount; memoryTypeNdx++)
3207         {
3208             if ((memReqs.memoryTypeBits & (1u << memoryTypeNdx)) != 0 &&
3209                 (deviceMemProps.memoryTypes[memoryTypeNdx].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) ==
3210                     VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)
3211                 break;
3212         }
3213         if (memoryTypeNdx == deviceMemProps.memoryTypeCount)
3214             TCU_THROW(NotSupportedError, "No compatible memory type found");
3215 
3216         const VkMemoryAllocateFlagsInfo allocDeviceMaskInfo = {
3217             VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO, // sType
3218             DE_NULL,                                      // pNext
3219             VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT,           // flags
3220             allocDeviceMask,                              // deviceMask
3221         };
3222 
3223         VkMemoryAllocateInfo allocInfo = {
3224             VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, // sType
3225             &allocDeviceMaskInfo,                   // pNext
3226             memReqs.size,                           // allocationSize
3227             memoryTypeNdx,                          // memoryTypeIndex
3228         };
3229 
3230         sboBufferMemory = allocateMemory(vk, device, &allocInfo);
3231         VK_CHECK(vk.bindBufferMemory(device, *sboBuffer, sboBufferMemory.get(), 0));
3232     }
3233 
3234     // Fill the buffers with data
3235     typedef std::vector<uint32_t> data_vector_t;
3236     data_vector_t uniformInputData(uniformBufSize, 0);
3237 
3238     {
3239         const Allocation &bufferAllocation = uniformBuffer.getAllocation();
3240         uint32_t *bufferPtr                = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
3241         for (uint32_t i = 0; i < uniformBufSize; ++i)
3242             uniformInputData[i] = *bufferPtr++ = rnd.getUint32() / 10; // divide to prevent overflow in addition
3243 
3244         flushAlloc(vk, device, bufferAllocation);
3245     }
3246 
3247     // Create descriptor set
3248     const Unique<VkDescriptorSetLayout> descriptorSetLayout(
3249         DescriptorSetLayoutBuilder()
3250             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3251             .addSingleBinding(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3252             .build(vk, device));
3253 
3254     const Unique<VkDescriptorPool> descriptorPool(
3255         DescriptorPoolBuilder()
3256             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3257             .addType(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER)
3258             .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3259 
3260     const Unique<VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
3261 
3262     const VkDescriptorBufferInfo bufferDescriptorInfo = makeDescriptorBufferInfo(*sboBuffer, 0ull, bufferSizeBytes);
3263     const VkDescriptorBufferInfo uniformBufferDescriptorInfo =
3264         makeDescriptorBufferInfo(*uniformBuffer, 0ull, uniformBufferSizeBytes);
3265 
3266     DescriptorSetUpdateBuilder()
3267         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(0u),
3268                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo)
3269         .writeSingle(*descriptorSet, DescriptorSetUpdateBuilder::Location::binding(1u),
3270                      VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, &uniformBufferDescriptorInfo)
3271         .update(vk, device);
3272 
3273     ComputePipelineWrapper pipeline(vk, device, m_computePipelineConstructionType,
3274                                     m_context.getBinaryCollection().get("comp"));
3275     pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
3276     pipeline.buildPipeline();
3277 
3278     const VkBufferMemoryBarrier hostUniformWriteBarrier = makeBufferMemoryBarrier(
3279         VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_UNIFORM_READ_BIT, *uniformBuffer, 0ull, uniformBufferSizeBytes);
3280     const VkBufferMemoryBarrier shaderWriteBarrier = makeBufferMemoryBarrier(
3281         VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT, *sboBuffer, 0ull, bufferSizeBytes);
3282 
3283     const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, m_queueFamilyIndex));
3284     const Unique<VkCommandBuffer> cmdBuffer(
3285         allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3286 
3287     // Verify multiple device masks
3288     for (uint32_t physDevMask = 1; physDevMask < (1u << m_numPhysDevices); physDevMask++)
3289     {
3290         uint32_t constantValPerLoop = 0;
3291         {
3292             const Allocation &bufferAllocation = uniformBuffer.getAllocation();
3293             uint32_t *bufferPtr                = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
3294             constantValPerLoop = *bufferPtr = rnd.getUint32() / 10; // divide to prevent overflow in addition
3295             flushAlloc(vk, device, bufferAllocation);
3296         }
3297         beginCommandBuffer(vk, *cmdBuffer);
3298 
3299         pipeline.bind(*cmdBuffer);
3300         vk.cmdBindDescriptorSets(*cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
3301                                  &descriptorSet.get(), 0u, DE_NULL);
3302         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
3303                               (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &hostUniformWriteBarrier, 0,
3304                               (const VkImageMemoryBarrier *)DE_NULL);
3305 
3306         vk.cmdSetDeviceMask(*cmdBuffer, physDevMask);
3307         vk.cmdDispatch(*cmdBuffer, m_workSize.x(), m_workSize.y(), m_workSize.z());
3308 
3309         vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
3310                               (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &shaderWriteBarrier, 0,
3311                               (const VkImageMemoryBarrier *)DE_NULL);
3312 
3313         endCommandBuffer(vk, *cmdBuffer);
3314         submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, physDevMask);
3315         m_context.resetCommandPoolForVKSC(device, *cmdPool);
3316 
3317         // Validate the results on all physical devices where compute shader was launched
3318         const VkBufferMemoryBarrier srcBufferBarrier = makeBufferMemoryBarrier(
3319             VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT, *sboBuffer, 0ull, bufferSizeBytes);
3320         const VkBufferMemoryBarrier dstBufferBarrier = makeBufferMemoryBarrier(
3321             VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *checkBuffer, 0ull, bufferSizeBytes);
3322         const VkBufferCopy copyParams = {
3323             (VkDeviceSize)0u, // srcOffset
3324             (VkDeviceSize)0u, // dstOffset
3325             bufferSizeBytes   // size
3326         };
3327 
3328         for (uint32_t physDevIdx = 0; physDevIdx < m_numPhysDevices; physDevIdx++)
3329         {
3330             if (!(1 << physDevIdx & physDevMask))
3331                 continue;
3332 
3333             const uint32_t deviceMask = 1 << physDevIdx;
3334 
3335             beginCommandBuffer(vk, *cmdBuffer);
3336             vk.cmdSetDeviceMask(*cmdBuffer, deviceMask);
3337             vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
3338                                   (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &srcBufferBarrier, 0,
3339                                   (const VkImageMemoryBarrier *)DE_NULL);
3340             vk.cmdCopyBuffer(*cmdBuffer, *sboBuffer, *checkBuffer, 1, &copyParams);
3341             vk.cmdPipelineBarrier(*cmdBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
3342                                   (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &dstBufferBarrier, 0,
3343                                   (const VkImageMemoryBarrier *)DE_NULL);
3344 
3345             endCommandBuffer(vk, *cmdBuffer);
3346             submitCommandsAndWait(vk, device, queue, *cmdBuffer, true, deviceMask);
3347 
3348             const Allocation &bufferAllocation = checkBuffer.getAllocation();
3349             invalidateAlloc(vk, device, bufferAllocation);
3350             const uint32_t *bufferPtr = static_cast<uint32_t *>(bufferAllocation.getHostPtr());
3351 
3352             for (uint32_t ndx = 0; ndx < m_numValues; ++ndx)
3353             {
3354                 const uint32_t res = bufferPtr[ndx];
3355                 const uint32_t ref = constantValPerLoop + uniformInputData[4 * (physDevIdx + 1)];
3356 
3357                 if (res != ref)
3358                 {
3359                     std::ostringstream msg;
3360                     msg << "Comparison failed on physical device " << getPhysicalDevice(physDevIdx) << " ( deviceMask "
3361                         << deviceMask << " ) for InOut.values[" << ndx << "]";
3362                     return tcu::TestStatus::fail(msg.str());
3363                 }
3364             }
3365         }
3366     }
3367 
3368     return tcu::TestStatus::pass("Compute succeeded");
3369 }
3370 
// Test case: runs the bit-flip compute shader concurrently on two queues
// (same or different queue families) created on a custom device.
class ConcurrentCompute : public vkt::TestCase
{
public:
    ConcurrentCompute(tcu::TestContext &testCtx, const std::string &name,
                      const vk::ComputePipelineConstructionType computePipelineConstructionType);

    // Throws NotSupported when the requested pipeline construction path is unavailable.
    virtual void checkSupport(Context &context) const;
    // Registers the GLSL compute shader "comp" (bitwise-NOT over a 1024-uint buffer).
    void initPrograms(SourceCollections &sourceCollections) const;
    TestInstance *createInstance(Context &context) const;

    // NOTE(review): public here, while sibling test classes keep this member
    // private -- likely unintentional inconsistency; confirm before changing.
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
3383 
// Instance side of ConcurrentCompute: builds its own device with two compute
// queues and performs the concurrent submissions (see iterate()).
class ConcurrentComputeInstance : public vkt::TestInstance
{
public:
    ConcurrentComputeInstance(Context &context,
                              const vk::ComputePipelineConstructionType computePipelineConstructionType);

    tcu::TestStatus iterate(void);

private:
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
3395 
ConcurrentCompute(tcu::TestContext & testCtx,const std::string & name,const vk::ComputePipelineConstructionType computePipelineConstructionType)3396 ConcurrentCompute::ConcurrentCompute(tcu::TestContext &testCtx, const std::string &name,
3397                                      const vk::ComputePipelineConstructionType computePipelineConstructionType)
3398     : TestCase(testCtx, name)
3399     , m_computePipelineConstructionType(computePipelineConstructionType)
3400 {
3401 }
3402 
checkSupport(Context & context) const3403 void ConcurrentCompute::checkSupport(Context &context) const
3404 {
3405     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
3406                                   m_computePipelineConstructionType);
3407 }
3408 
initPrograms(SourceCollections & sourceCollections) const3409 void ConcurrentCompute::initPrograms(SourceCollections &sourceCollections) const
3410 {
3411     std::ostringstream src;
3412     src << "#version 310 es\n"
3413         << "layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;\n"
3414         << "layout(binding = 0) buffer InOut {\n"
3415         << "    uint values[1024];\n"
3416         << "} sb_inout;\n"
3417         << "void main (void) {\n"
3418         << "    uvec3 size           = gl_NumWorkGroups * gl_WorkGroupSize;\n"
3419         << "    uint numValuesPerInv = uint(sb_inout.values.length()) / (size.x*size.y*size.z);\n"
3420         << "    uint groupNdx        = size.x*size.y*gl_GlobalInvocationID.z + size.x*gl_GlobalInvocationID.y + "
3421            "gl_GlobalInvocationID.x;\n"
3422         << "    uint offset          = numValuesPerInv*groupNdx;\n"
3423         << "\n"
3424         << "    for (uint ndx = 0u; ndx < numValuesPerInv; ndx++)\n"
3425         << "        sb_inout.values[offset + ndx] = ~sb_inout.values[offset + ndx];\n"
3426         << "}\n";
3427 
3428     sourceCollections.glslSources.add("comp") << glu::ComputeSource(src.str());
3429 }
3430 
createInstance(Context & context) const3431 TestInstance *ConcurrentCompute::createInstance(Context &context) const
3432 {
3433     return new ConcurrentComputeInstance(context, m_computePipelineConstructionType);
3434 }
3435 
ConcurrentComputeInstance(Context & context,const vk::ComputePipelineConstructionType computePipelineConstructionType)3436 ConcurrentComputeInstance::ConcurrentComputeInstance(
3437     Context &context, const vk::ComputePipelineConstructionType computePipelineConstructionType)
3438     : TestInstance(context)
3439     , m_computePipelineConstructionType(computePipelineConstructionType)
3440 {
3441 }
3442 
iterate(void)3443 tcu::TestStatus ConcurrentComputeInstance::iterate(void)
3444 {
3445     enum
3446     {
3447         NO_MATCH_FOUND = ~((uint32_t)0),
3448         ERROR_NONE     = 0,
3449         ERROR_WAIT     = 1,
3450         ERROR_ORDER    = 2
3451     };
3452 
3453     struct Queues
3454     {
3455         VkQueue queue;
3456         uint32_t queueFamilyIndex;
3457     };
3458 
3459     // const DeviceInterface& vk = m_context.getDeviceInterface();
3460     const uint32_t numValues = 1024;
3461     const CustomInstance instance(createCustomInstanceFromContext(m_context));
3462     const InstanceDriver &instanceDriver(instance.getDriver());
3463     const VkPhysicalDevice physicalDevice =
3464         chooseDevice(instanceDriver, instance, m_context.getTestContext().getCommandLine());
3465     tcu::TestLog &log = m_context.getTestContext().getLog();
3466     vk::Move<vk::VkDevice> logicalDevice;
3467     std::vector<VkQueueFamilyProperties> queueFamilyProperties;
3468     VkDeviceCreateInfo deviceInfo;
3469     VkPhysicalDeviceFeatures2 deviceFeatures2 = initVulkanStructure();
3470     VkPhysicalDeviceFeatures deviceFeatures;
3471     const float queuePriorities[2] = {1.0f, 0.0f};
3472     VkDeviceQueueCreateInfo queueInfos[2];
3473     Queues queues[2] = {{DE_NULL, (uint32_t)NO_MATCH_FOUND}, {DE_NULL, (uint32_t)NO_MATCH_FOUND}};
3474 
3475     queueFamilyProperties = getPhysicalDeviceQueueFamilyProperties(instanceDriver, physicalDevice);
3476 
3477     for (uint32_t queueNdx = 0; queueNdx < queueFamilyProperties.size(); ++queueNdx)
3478     {
3479         if (queueFamilyProperties[queueNdx].queueFlags & VK_QUEUE_COMPUTE_BIT)
3480         {
3481             if (NO_MATCH_FOUND == queues[0].queueFamilyIndex)
3482                 queues[0].queueFamilyIndex = queueNdx;
3483 
3484             if (queues[0].queueFamilyIndex != queueNdx || queueFamilyProperties[queueNdx].queueCount > 1u)
3485             {
3486                 queues[1].queueFamilyIndex = queueNdx;
3487                 break;
3488             }
3489         }
3490     }
3491 
3492     if (queues[0].queueFamilyIndex == NO_MATCH_FOUND || queues[1].queueFamilyIndex == NO_MATCH_FOUND)
3493         TCU_THROW(NotSupportedError, "Queues couldn't be created");
3494 
3495     for (int queueNdx = 0; queueNdx < 2; ++queueNdx)
3496     {
3497         VkDeviceQueueCreateInfo queueInfo;
3498         deMemset(&queueInfo, 0, sizeof(queueInfo));
3499 
3500         queueInfo.sType            = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
3501         queueInfo.pNext            = DE_NULL;
3502         queueInfo.flags            = (VkDeviceQueueCreateFlags)0u;
3503         queueInfo.queueFamilyIndex = queues[queueNdx].queueFamilyIndex;
3504         queueInfo.queueCount       = (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 2 : 1;
3505         queueInfo.pQueuePriorities = (queueInfo.queueCount == 2) ? queuePriorities : &queuePriorities[queueNdx];
3506 
3507         queueInfos[queueNdx] = queueInfo;
3508 
3509         if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3510             break;
3511     }
3512 
3513     void *pNext = DE_NULL;
3514 
3515     deMemset(&deviceInfo, 0, sizeof(deviceInfo));
3516     instanceDriver.getPhysicalDeviceFeatures(physicalDevice, &deviceFeatures);
3517 
3518     deviceFeatures2.features = deviceFeatures;
3519 
3520     std::vector<const char *> deviceExtensions;
3521 
3522 #ifndef CTS_USES_VULKANSC
3523     VkPhysicalDeviceDynamicRenderingFeaturesKHR dynamicRenderingFeatures = initVulkanStructure();
3524     dynamicRenderingFeatures.dynamicRendering                            = VK_TRUE;
3525     VkPhysicalDeviceShaderObjectFeaturesEXT shaderObjectFeatures = initVulkanStructure(&dynamicRenderingFeatures);
3526     shaderObjectFeatures.shaderObject                            = VK_TRUE;
3527 
3528     if (m_computePipelineConstructionType != COMPUTE_PIPELINE_CONSTRUCTION_TYPE_PIPELINE)
3529     {
3530         deviceExtensions.push_back("VK_EXT_shader_object");
3531         deviceFeatures2.pNext = &shaderObjectFeatures;
3532         pNext                 = &deviceFeatures2;
3533     }
3534 #endif
3535 
3536 #ifdef CTS_USES_VULKANSC
3537     VkDeviceObjectReservationCreateInfo memReservationInfo =
3538         m_context.getTestContext().getCommandLine().isSubProcess() ? m_context.getResourceInterface()->getStatMax() :
3539                                                                      resetDeviceObjectReservationCreateInfo();
3540     memReservationInfo.pNext = pNext;
3541     pNext                    = &memReservationInfo;
3542 
3543     VkPhysicalDeviceVulkanSC10Features sc10Features = createDefaultSC10Features();
3544     sc10Features.pNext                              = pNext;
3545     pNext                                           = &sc10Features;
3546 
3547     VkPipelineCacheCreateInfo pcCI;
3548     std::vector<VkPipelinePoolSize> poolSizes;
3549     if (m_context.getTestContext().getCommandLine().isSubProcess())
3550     {
3551         if (m_context.getResourceInterface()->getCacheDataSize() > 0)
3552         {
3553             pcCI = {
3554                 VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, // VkStructureType sType;
3555                 DE_NULL,                                      // const void* pNext;
3556                 VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
3557                     VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT, // VkPipelineCacheCreateFlags flags;
3558                 m_context.getResourceInterface()->getCacheDataSize(),     // uintptr_t initialDataSize;
3559                 m_context.getResourceInterface()->getCacheData()          // const void* pInitialData;
3560             };
3561             memReservationInfo.pipelineCacheCreateInfoCount = 1;
3562             memReservationInfo.pPipelineCacheCreateInfos    = &pcCI;
3563         }
3564 
3565         poolSizes = m_context.getResourceInterface()->getPipelinePoolSizes();
3566         if (!poolSizes.empty())
3567         {
3568             memReservationInfo.pipelinePoolSizeCount = uint32_t(poolSizes.size());
3569             memReservationInfo.pPipelinePoolSizes    = poolSizes.data();
3570         }
3571     }
3572 #endif // CTS_USES_VULKANSC
3573 
3574     deviceInfo.sType                   = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
3575     deviceInfo.pNext                   = pNext;
3576     deviceInfo.enabledExtensionCount   = (uint32_t)deviceExtensions.size();
3577     deviceInfo.ppEnabledExtensionNames = deviceExtensions.data();
3578     deviceInfo.enabledLayerCount       = 0u;
3579     deviceInfo.ppEnabledLayerNames     = DE_NULL;
3580     deviceInfo.pEnabledFeatures        = (deviceFeatures2.pNext == DE_NULL) ? &deviceFeatures : DE_NULL;
3581     deviceInfo.queueCreateInfoCount    = (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex) ? 1 : 2;
3582     deviceInfo.pQueueCreateInfos       = queueInfos;
3583 
3584     logicalDevice =
3585         createCustomDevice(m_context.getTestContext().getCommandLine().isValidationEnabled(),
3586                            m_context.getPlatformInterface(), instance, instanceDriver, physicalDevice, &deviceInfo);
3587 
3588 #ifndef CTS_USES_VULKANSC
3589     de::MovePtr<vk::DeviceDriver> deviceDriver = de::MovePtr<DeviceDriver>(
3590         new DeviceDriver(m_context.getPlatformInterface(), instance, *logicalDevice, m_context.getUsedApiVersion(),
3591                          m_context.getTestContext().getCommandLine()));
3592 #else
3593     de::MovePtr<vk::DeviceDriverSC, vk::DeinitDeviceDeleter> deviceDriver =
3594         de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(
3595             new DeviceDriverSC(m_context.getPlatformInterface(), instance, *logicalDevice,
3596                                m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(),
3597                                m_context.getDeviceVulkanSC10Properties(), m_context.getDeviceProperties(),
3598                                m_context.getUsedApiVersion()),
3599             vk::DeinitDeviceDeleter(m_context.getResourceInterface().get(), *logicalDevice));
3600 #endif // CTS_USES_VULKANSC
3601     vk::DeviceInterface &vk = *deviceDriver;
3602 
3603     for (uint32_t queueReqNdx = 0; queueReqNdx < 2; ++queueReqNdx)
3604     {
3605         if (queues[0].queueFamilyIndex == queues[1].queueFamilyIndex)
3606             vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, queueReqNdx,
3607                               &queues[queueReqNdx].queue);
3608         else
3609             vk.getDeviceQueue(*logicalDevice, queues[queueReqNdx].queueFamilyIndex, 0u, &queues[queueReqNdx].queue);
3610     }
3611 
3612     // Create an input/output buffers
3613     const VkPhysicalDeviceMemoryProperties memoryProperties =
3614         vk::getPhysicalDeviceMemoryProperties(instanceDriver, physicalDevice);
3615 
3616     de::MovePtr<SimpleAllocator> allocator =
3617         de::MovePtr<SimpleAllocator>(new SimpleAllocator(vk, *logicalDevice, memoryProperties));
3618     const VkDeviceSize bufferSizeBytes = sizeof(uint32_t) * numValues;
3619     const BufferWithMemory buffer1(vk, *logicalDevice, *allocator,
3620                                    makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
3621                                    MemoryRequirement::HostVisible);
3622     const BufferWithMemory buffer2(vk, *logicalDevice, *allocator,
3623                                    makeBufferCreateInfo(bufferSizeBytes, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
3624                                    MemoryRequirement::HostVisible);
3625 
3626     // Fill the buffers with data
3627 
3628     typedef std::vector<uint32_t> data_vector_t;
3629     data_vector_t inputData(numValues);
3630 
3631     {
3632         de::Random rnd(0x82ce7f);
3633         const Allocation &bufferAllocation1 = buffer1.getAllocation();
3634         const Allocation &bufferAllocation2 = buffer2.getAllocation();
3635         uint32_t *bufferPtr1                = static_cast<uint32_t *>(bufferAllocation1.getHostPtr());
3636         uint32_t *bufferPtr2                = static_cast<uint32_t *>(bufferAllocation2.getHostPtr());
3637 
3638         for (uint32_t i = 0; i < numValues; ++i)
3639         {
3640             uint32_t val  = rnd.getUint32();
3641             inputData[i]  = val;
3642             *bufferPtr1++ = val;
3643             *bufferPtr2++ = val;
3644         }
3645 
3646         flushAlloc(vk, *logicalDevice, bufferAllocation1);
3647         flushAlloc(vk, *logicalDevice, bufferAllocation2);
3648     }
3649 
3650     // Create descriptor sets
3651 
3652     const Unique<VkDescriptorSetLayout> descriptorSetLayout1(
3653         DescriptorSetLayoutBuilder()
3654             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3655             .build(vk, *logicalDevice));
3656 
3657     const Unique<VkDescriptorPool> descriptorPool1(
3658         DescriptorPoolBuilder()
3659             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3660             .build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3661 
3662     const Unique<VkDescriptorSet> descriptorSet1(
3663         makeDescriptorSet(vk, *logicalDevice, *descriptorPool1, *descriptorSetLayout1));
3664 
3665     const VkDescriptorBufferInfo bufferDescriptorInfo1 = makeDescriptorBufferInfo(*buffer1, 0ull, bufferSizeBytes);
3666     DescriptorSetUpdateBuilder()
3667         .writeSingle(*descriptorSet1, DescriptorSetUpdateBuilder::Location::binding(0u),
3668                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo1)
3669         .update(vk, *logicalDevice);
3670 
3671     const Unique<VkDescriptorSetLayout> descriptorSetLayout2(
3672         DescriptorSetLayoutBuilder()
3673             .addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT)
3674             .build(vk, *logicalDevice));
3675 
3676     const Unique<VkDescriptorPool> descriptorPool2(
3677         DescriptorPoolBuilder()
3678             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER)
3679             .build(vk, *logicalDevice, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
3680 
3681     const Unique<VkDescriptorSet> descriptorSet2(
3682         makeDescriptorSet(vk, *logicalDevice, *descriptorPool2, *descriptorSetLayout2));
3683 
3684     const VkDescriptorBufferInfo bufferDescriptorInfo2 = makeDescriptorBufferInfo(*buffer2, 0ull, bufferSizeBytes);
3685     DescriptorSetUpdateBuilder()
3686         .writeSingle(*descriptorSet2, DescriptorSetUpdateBuilder::Location::binding(0u),
3687                      VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptorInfo2)
3688         .update(vk, *logicalDevice);
3689 
3690     // Perform the computation
3691 
3692     const Unique<VkShaderModule> shaderModule(
3693         createShaderModule(vk, *logicalDevice, m_context.getBinaryCollection().get("comp"), 0u));
3694 
3695     ComputePipelineWrapper pipeline1(vk, *logicalDevice, m_computePipelineConstructionType,
3696                                      m_context.getBinaryCollection().get("comp"));
3697     pipeline1.setDescriptorSetLayout(*descriptorSetLayout1);
3698     pipeline1.buildPipeline();
3699     const VkBufferMemoryBarrier hostWriteBarrier1 =
3700         makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3701     const VkBufferMemoryBarrier shaderWriteBarrier1 =
3702         makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer1, 0ull, bufferSizeBytes);
3703     const Unique<VkCommandPool> cmdPool1(makeCommandPool(vk, *logicalDevice, queues[0].queueFamilyIndex));
3704     const Unique<VkCommandBuffer> cmdBuffer1(
3705         allocateCommandBuffer(vk, *logicalDevice, *cmdPool1, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3706 
3707     ComputePipelineWrapper pipeline2(vk, *logicalDevice, m_computePipelineConstructionType,
3708                                      m_context.getBinaryCollection().get("comp"));
3709     pipeline2.setDescriptorSetLayout(*descriptorSetLayout2);
3710     pipeline2.buildPipeline();
3711     const VkBufferMemoryBarrier hostWriteBarrier2 =
3712         makeBufferMemoryBarrier(VK_ACCESS_HOST_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3713     const VkBufferMemoryBarrier shaderWriteBarrier2 =
3714         makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, *buffer2, 0ull, bufferSizeBytes);
3715     const Unique<VkCommandPool> cmdPool2(makeCommandPool(vk, *logicalDevice, queues[1].queueFamilyIndex));
3716     const Unique<VkCommandBuffer> cmdBuffer2(
3717         allocateCommandBuffer(vk, *logicalDevice, *cmdPool2, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
3718 
3719     // Command buffer 1
3720 
3721     beginCommandBuffer(vk, *cmdBuffer1);
3722     pipeline1.bind(*cmdBuffer1);
3723     vk.cmdBindDescriptorSets(*cmdBuffer1, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline1.getPipelineLayout(), 0u, 1u,
3724                              &descriptorSet1.get(), 0u, DE_NULL);
3725     vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
3726                           (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &hostWriteBarrier1, 0,
3727                           (const VkImageMemoryBarrier *)DE_NULL);
3728     vk.cmdDispatch(*cmdBuffer1, 1, 1, 1);
3729     vk.cmdPipelineBarrier(*cmdBuffer1, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
3730                           (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &shaderWriteBarrier1, 0,
3731                           (const VkImageMemoryBarrier *)DE_NULL);
3732     endCommandBuffer(vk, *cmdBuffer1);
3733 
3734     // Command buffer 2
3735 
3736     beginCommandBuffer(vk, *cmdBuffer2);
3737     pipeline2.bind(*cmdBuffer2);
3738     vk.cmdBindDescriptorSets(*cmdBuffer2, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline2.getPipelineLayout(), 0u, 1u,
3739                              &descriptorSet2.get(), 0u, DE_NULL);
3740     vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
3741                           (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &hostWriteBarrier2, 0,
3742                           (const VkImageMemoryBarrier *)DE_NULL);
3743     vk.cmdDispatch(*cmdBuffer2, 1, 1, 1);
3744     vk.cmdPipelineBarrier(*cmdBuffer2, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT,
3745                           (VkDependencyFlags)0, 0, (const VkMemoryBarrier *)DE_NULL, 1, &shaderWriteBarrier2, 0,
3746                           (const VkImageMemoryBarrier *)DE_NULL);
3747     endCommandBuffer(vk, *cmdBuffer2);
3748 
3749     VkSubmitInfo submitInfo1 = {
3750         VK_STRUCTURE_TYPE_SUBMIT_INFO,         // sType
3751         DE_NULL,                               // pNext
3752         0u,                                    // waitSemaphoreCount
3753         DE_NULL,                               // pWaitSemaphores
3754         (const VkPipelineStageFlags *)DE_NULL, // pWaitDstStageMask
3755         1u,                                    // commandBufferCount
3756         &cmdBuffer1.get(),                     // pCommandBuffers
3757         0u,                                    // signalSemaphoreCount
3758         DE_NULL                                // pSignalSemaphores
3759     };
3760 
3761     VkSubmitInfo submitInfo2 = {
3762         VK_STRUCTURE_TYPE_SUBMIT_INFO,         // sType
3763         DE_NULL,                               // pNext
3764         0u,                                    // waitSemaphoreCount
3765         DE_NULL,                               // pWaitSemaphores
3766         (const VkPipelineStageFlags *)DE_NULL, // pWaitDstStageMask
3767         1u,                                    // commandBufferCount
3768         &cmdBuffer2.get(),                     // pCommandBuffers
3769         0u,                                    // signalSemaphoreCount
3770         DE_NULL                                // pSignalSemaphores
3771     };
3772 
3773     // Wait for completion
3774     const Unique<VkFence> fence1(createFence(vk, *logicalDevice));
3775     const Unique<VkFence> fence2(createFence(vk, *logicalDevice));
3776 
3777     VK_CHECK(vk.queueSubmit(queues[0].queue, 1u, &submitInfo1, *fence1));
3778     VK_CHECK(vk.queueSubmit(queues[1].queue, 1u, &submitInfo2, *fence2));
3779 
3780     int err = ERROR_NONE;
3781 
3782     // First wait for the low-priority queue
3783     if (VK_SUCCESS != vk.waitForFences(*logicalDevice, 1u, &fence2.get(), true, ~0ull))
3784         err = ERROR_WAIT;
3785 
3786     // If the high-priority queue hasn't finished, we have a problem.
3787     if (VK_SUCCESS != vk.getFenceStatus(*logicalDevice, fence1.get()))
3788         if (err == ERROR_NONE)
3789             err = ERROR_ORDER;
3790 
3791     // Wait for the high-priority fence so we don't get errors on teardown.
3792     vk.waitForFences(*logicalDevice, 1u, &fence1.get(), true, ~0ull);
3793 
3794     // If we fail() before waiting for all of the fences, error will come from
3795     // teardown instead of the error we want.
3796 
3797     if (err == ERROR_WAIT)
3798     {
3799         return tcu::TestStatus::fail("Failed waiting for low-priority queue fence.");
3800     }
3801 
3802     // Validate the results
3803 
3804     const Allocation &bufferAllocation1 = buffer1.getAllocation();
3805     invalidateAlloc(vk, *logicalDevice, bufferAllocation1);
3806     const uint32_t *bufferPtr1 = static_cast<uint32_t *>(bufferAllocation1.getHostPtr());
3807 
3808     const Allocation &bufferAllocation2 = buffer2.getAllocation();
3809     invalidateAlloc(vk, *logicalDevice, bufferAllocation2);
3810     const uint32_t *bufferPtr2 = static_cast<uint32_t *>(bufferAllocation2.getHostPtr());
3811 
3812     for (uint32_t ndx = 0; ndx < numValues; ++ndx)
3813     {
3814         const uint32_t res1 = bufferPtr1[ndx];
3815         const uint32_t res2 = bufferPtr2[ndx];
3816         const uint32_t inp  = inputData[ndx];
3817         const uint32_t ref  = ~inp;
3818 
3819         if (res1 != ref || res1 != res2)
3820         {
3821             std::ostringstream msg;
3822             msg << "Comparison failed for InOut.values[" << ndx << "] ref:" << ref << " res1:" << res1
3823                 << " res2:" << res2 << " inp:" << inp;
3824             return tcu::TestStatus::fail(msg.str());
3825         }
3826     }
3827 
3828     if (err == ERROR_ORDER)
3829         log << tcu::TestLog::Message
3830             << "Note: Low-priority queue was faster than high-priority one. This is not an error, but priorities may "
3831                "be inverted."
3832             << tcu::TestLog::EndMessage;
3833 
3834     return tcu::TestStatus::pass("Test passed");
3835 }
3836 
// Test case verifying that a dispatch with at least one zero-sized dimension
// launches no workgroups (the constructor asserts one component of the
// dispatch size is zero).
class EmptyWorkGroupCase : public vkt::TestCase
{
public:
    EmptyWorkGroupCase(tcu::TestContext &testCtx, const std::string &name, const tcu::UVec3 &dispatchSize,
                       const vk::ComputePipelineConstructionType computePipelineConstructionType);
    virtual ~EmptyWorkGroupCase(void)
    {
    }

    virtual void checkSupport(Context &context) const override;
    TestInstance *createInstance(Context &context) const override;
    void initPrograms(vk::SourceCollections &programCollection) const override;

protected:
    // Dispatch dimensions; at least one component must be zero.
    const tcu::UVec3 m_dispatchSize;
    // Pipeline construction variant forwarded to ComputePipelineWrapper by the instance.
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
3854 
// Instance running the empty-workgroup dispatch test; see
// EmptyWorkGroupInstance::iterate for the actual recording and verification.
class EmptyWorkGroupInstance : public vkt::TestInstance
{
public:
    EmptyWorkGroupInstance(Context &context, const tcu::UVec3 &dispatchSize,
                           const vk::ComputePipelineConstructionType computePipelineConstructionType)
        : vkt::TestInstance(context)
        , m_dispatchSize(dispatchSize)
        , m_computePipelineConstructionType(computePipelineConstructionType)
    {
    }
    virtual ~EmptyWorkGroupInstance(void)
    {
    }

    tcu::TestStatus iterate(void) override;

protected:
    // Dispatch dimensions; at least one component is zero (asserted by the case).
    const tcu::UVec3 m_dispatchSize;
    // Pipeline construction variant passed to ComputePipelineWrapper.
    vk::ComputePipelineConstructionType m_computePipelineConstructionType;
};
3875 
EmptyWorkGroupCase(tcu::TestContext & testCtx,const std::string & name,const tcu::UVec3 & dispatchSize,const vk::ComputePipelineConstructionType computePipelineConstructionType)3876 EmptyWorkGroupCase::EmptyWorkGroupCase(tcu::TestContext &testCtx, const std::string &name,
3877                                        const tcu::UVec3 &dispatchSize,
3878                                        const vk::ComputePipelineConstructionType computePipelineConstructionType)
3879     : vkt::TestCase(testCtx, name)
3880     , m_dispatchSize(dispatchSize)
3881     , m_computePipelineConstructionType(computePipelineConstructionType)
3882 {
3883     DE_ASSERT(m_dispatchSize.x() == 0u || m_dispatchSize.y() == 0u || m_dispatchSize.z() == 0u);
3884 }
3885 
checkSupport(Context & context) const3886 void EmptyWorkGroupCase::checkSupport(Context &context) const
3887 {
3888     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
3889                                   m_computePipelineConstructionType);
3890 }
3891 
createInstance(Context & context) const3892 TestInstance *EmptyWorkGroupCase::createInstance(Context &context) const
3893 {
3894     return new EmptyWorkGroupInstance(context, m_dispatchSize, m_computePipelineConstructionType);
3895 }
3896 
initPrograms(vk::SourceCollections & programCollection) const3897 void EmptyWorkGroupCase::initPrograms(vk::SourceCollections &programCollection) const
3898 {
3899     std::ostringstream comp;
3900     comp << "#version 450\n"
3901          << "layout (local_size_x=1, local_size_y=1, local_size_z=1) in;\n"
3902          << "layout (set=0, binding=0) buffer VerificationBlock { uint value; } verif;\n"
3903          << "void main () { atomicAdd(verif.value, 1u); }\n";
3904     programCollection.glslSources.add("comp") << glu::ComputeSource(comp.str());
3905 }
3906 
// Records two dispatches of the counter-incrementing shader: first the "empty"
// dispatch (one dimension is zero, so no workgroups may run), then a 1x1x1
// dispatch. The counter must end up exactly 1, proving the empty dispatch
// executed nothing.
tcu::TestStatus EmptyWorkGroupInstance::iterate(void)
{
    const auto &vkd       = m_context.getDeviceInterface();
    const auto device     = m_context.getDevice();
    auto &alloc           = m_context.getDefaultAllocator();
    const auto queueIndex = m_context.getUniversalQueueFamilyIndex();
    const auto queue      = m_context.getUniversalQueue();

    // Host-visible buffer holding the single uint32_t verification counter.
    const auto verifBufferSize = static_cast<VkDeviceSize>(sizeof(uint32_t));
    const auto verifBufferInfo = makeBufferCreateInfo(verifBufferSize, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
    BufferWithMemory verifBuffer(vkd, device, alloc, verifBufferInfo, MemoryRequirement::HostVisible);
    auto &verifBufferAlloc = verifBuffer.getAllocation();
    void *verifBufferPtr   = verifBufferAlloc.getHostPtr();

    // Zero the counter before any device work is submitted.
    deMemset(verifBufferPtr, 0, static_cast<size_t>(verifBufferSize));
    flushAlloc(vkd, device, verifBufferAlloc);

    // Single storage-buffer binding for the counter.
    DescriptorSetLayoutBuilder layoutBuilder;
    layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT);
    const auto descriptorSetLayout = layoutBuilder.build(vkd, device);

    ComputePipelineWrapper pipeline(vkd, device, m_computePipelineConstructionType,
                                    m_context.getBinaryCollection().get("comp"));
    pipeline.setDescriptorSetLayout(descriptorSetLayout.get());
    pipeline.buildPipeline();

    DescriptorPoolBuilder poolBuilder;
    poolBuilder.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
    const auto descriptorPool = poolBuilder.build(vkd, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
    const auto descriptorSet  = makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());

    // Point binding 0 at the verification buffer.
    DescriptorSetUpdateBuilder updateBuilder;
    const auto verifBufferDescInfo = makeDescriptorBufferInfo(verifBuffer.get(), 0ull, verifBufferSize);
    updateBuilder.writeSingle(descriptorSet.get(), DescriptorSetUpdateBuilder::Location::binding(0u),
                              VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &verifBufferDescInfo);
    updateBuilder.update(vkd, device);

    const auto cmdPool      = makeCommandPool(vkd, device, queueIndex);
    const auto cmdBufferPtr = allocateCommandBuffer(vkd, device, cmdPool.get(), VK_COMMAND_BUFFER_LEVEL_PRIMARY);
    const auto cmdBuffer    = cmdBufferPtr.get();

    beginCommandBuffer(vkd, cmdBuffer);
    pipeline.bind(cmdBuffer);
    vkd.cmdBindDescriptorSets(cmdBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline.getPipelineLayout(), 0u, 1u,
                              &descriptorSet.get(), 0u, nullptr);
    // Empty dispatch: one of these dimensions is zero, so no invocation may run.
    vkd.cmdDispatch(cmdBuffer, m_dispatchSize.x(), m_dispatchSize.y(), m_dispatchSize.z());

    // Order the (empty) dispatch before the real one below.
    const auto readWriteAccess  = (VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_SHADER_READ_BIT);
    const auto computeToCompute = makeMemoryBarrier(readWriteAccess, readWriteAccess);
    vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0U,
                           1u, &computeToCompute, 0u, nullptr, 0u, nullptr);

    // Real dispatch: a single 1x1x1 workgroup, so the counter must become 1.
    vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);

    // Make the shader write visible to the host readback after submission.
    const auto computeToHost = makeMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT);
    vkd.cmdPipelineBarrier(cmdBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_HOST_BIT, 0u, 1u,
                           &computeToHost, 0u, nullptr, 0u, nullptr);

    endCommandBuffer(vkd, cmdBuffer);
    submitCommandsAndWait(vkd, device, queue, cmdBuffer);

    // Read back and verify the counter.
    uint32_t value;
    invalidateAlloc(vkd, device, verifBufferAlloc);
    deMemcpy(&value, verifBufferPtr, sizeof(value));

    if (value != 1u)
    {
        std::ostringstream msg;
        msg << "Unexpected value found in buffer: " << value << " while expecting 1";
        TCU_FAIL(msg.str());
    }

    return tcu::TestStatus::pass("Pass");
}
3981 
3982 class MaxWorkGroupSizeTest : public vkt::TestCase
3983 {
3984 public:
3985     enum class Axis
3986     {
3987         X = 0,
3988         Y = 1,
3989         Z = 2
3990     };
3991 
3992     struct Params
3993     {
3994         // Which axis to maximize.
3995         Axis axis;
3996     };
3997 
3998     MaxWorkGroupSizeTest(tcu::TestContext &testCtx, const std::string &name, const Params &params,
3999                          const vk::ComputePipelineConstructionType computePipelineConstructionType);
~MaxWorkGroupSizeTest(void)4000     virtual ~MaxWorkGroupSizeTest(void)
4001     {
4002     }
4003 
4004     virtual void initPrograms(vk::SourceCollections &programCollection) const;
4005     virtual TestInstance *createInstance(Context &context) const;
4006     virtual void checkSupport(Context &context) const;
4007 
4008     // Helper to transform the axis value to an index.
4009     static int getIndex(Axis axis);
4010 
4011     // Helper returning the number of invocations according to the test parameters.
4012     static uint32_t getInvocations(const Params &params, const vk::InstanceInterface &vki,
4013                                    vk::VkPhysicalDevice physicalDevice,
4014                                    const vk::VkPhysicalDeviceProperties *devProperties = nullptr);
4015 
4016     // Helper returning the buffer size needed to this test.
4017     static uint32_t getSSBOSize(uint32_t invocations);
4018 
4019 private:
4020     Params m_params;
4021     vk::ComputePipelineConstructionType m_computePipelineConstructionType;
4022 };
4023 
4024 class MaxWorkGroupSizeInstance : public vkt::TestInstance
4025 {
4026 public:
4027     MaxWorkGroupSizeInstance(Context &context, const MaxWorkGroupSizeTest::Params &params,
4028                              const vk::ComputePipelineConstructionType computePipelineConstructionType);
~MaxWorkGroupSizeInstance(void)4029     virtual ~MaxWorkGroupSizeInstance(void)
4030     {
4031     }
4032 
4033     virtual tcu::TestStatus iterate(void);
4034 
4035 private:
4036     MaxWorkGroupSizeTest::Params m_params;
4037     vk::ComputePipelineConstructionType m_computePipelineConstructionType;
4038 };
4039 
getIndex(Axis axis)4040 int MaxWorkGroupSizeTest::getIndex(Axis axis)
4041 {
4042     const int ret = static_cast<int>(axis);
4043     DE_ASSERT(ret >= static_cast<int>(Axis::X) && ret <= static_cast<int>(Axis::Z));
4044     return ret;
4045 }
4046 
getInvocations(const Params & params,const vk::InstanceInterface & vki,vk::VkPhysicalDevice physicalDevice,const vk::VkPhysicalDeviceProperties * devProperties)4047 uint32_t MaxWorkGroupSizeTest::getInvocations(const Params &params, const vk::InstanceInterface &vki,
4048                                               vk::VkPhysicalDevice physicalDevice,
4049                                               const vk::VkPhysicalDeviceProperties *devProperties)
4050 {
4051     const auto axis = getIndex(params.axis);
4052 
4053     if (devProperties)
4054         return devProperties->limits.maxComputeWorkGroupSize[axis];
4055     return vk::getPhysicalDeviceProperties(vki, physicalDevice).limits.maxComputeWorkGroupSize[axis];
4056 }
4057 
getSSBOSize(uint32_t invocations)4058 uint32_t MaxWorkGroupSizeTest::getSSBOSize(uint32_t invocations)
4059 {
4060     return invocations * static_cast<uint32_t>(sizeof(uint32_t));
4061 }
4062 
// Stores the axis selection and pipeline construction type; all validation
// happens later in checkSupport.
MaxWorkGroupSizeTest::MaxWorkGroupSizeTest(tcu::TestContext &testCtx, const std::string &name, const Params &params,
                                           const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : vkt::TestCase(testCtx, name)
    , m_params(params)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
4070 
initPrograms(vk::SourceCollections & programCollection) const4071 void MaxWorkGroupSizeTest::initPrograms(vk::SourceCollections &programCollection) const
4072 {
4073     std::ostringstream shader;
4074 
4075     // The actual local sizes will be set using spec constants when running the test instance.
4076     shader << "#version 450\n"
4077            << "\n"
4078            << "layout(constant_id=0) const int local_size_x_val = 1;\n"
4079            << "layout(constant_id=1) const int local_size_y_val = 1;\n"
4080            << "layout(constant_id=2) const int local_size_z_val = 1;\n"
4081            << "\n"
4082            << "layout(local_size_x_id=0, local_size_y_id=1, local_size_z_id=2) in;\n"
4083            << "\n"
4084            << "layout(set=0, binding=0) buffer StorageBuffer {\n"
4085            << "    uint values[];\n"
4086            << "} ssbo;\n"
4087            << "\n"
4088            << "void main() {\n"
4089            << "    ssbo.values[gl_LocalInvocationIndex] = 1u;\n"
4090            << "}\n";
4091 
4092     programCollection.glslSources.add("comp") << glu::ComputeSource(shader.str());
4093 }
4094 
createInstance(Context & context) const4095 TestInstance *MaxWorkGroupSizeTest::createInstance(Context &context) const
4096 {
4097     return new MaxWorkGroupSizeInstance(context, m_params, m_computePipelineConstructionType);
4098 }
4099 
checkSupport(Context & context) const4100 void MaxWorkGroupSizeTest::checkSupport(Context &context) const
4101 {
4102     const auto &vki           = context.getInstanceInterface();
4103     const auto physicalDevice = context.getPhysicalDevice();
4104 
4105     const auto properties  = vk::getPhysicalDeviceProperties(vki, physicalDevice);
4106     const auto invocations = getInvocations(m_params, vki, physicalDevice, &properties);
4107 
4108     if (invocations > properties.limits.maxComputeWorkGroupInvocations)
4109         TCU_FAIL("Reported workgroup size limit in the axis is greater than the global invocation limit");
4110 
4111     if (properties.limits.maxStorageBufferRange / static_cast<uint32_t>(sizeof(uint32_t)) < invocations)
4112         TCU_THROW(NotSupportedError, "Maximum supported storage buffer range too small");
4113 
4114     checkShaderObjectRequirements(vki, physicalDevice, m_computePipelineConstructionType);
4115 }
4116 
// Stores the parameters for use in iterate; no device work happens here.
MaxWorkGroupSizeInstance::MaxWorkGroupSizeInstance(
    Context &context, const MaxWorkGroupSizeTest::Params &params,
    const vk::ComputePipelineConstructionType computePipelineConstructionType)
    : vkt::TestInstance(context)
    , m_params(params)
    , m_computePipelineConstructionType(computePipelineConstructionType)
{
}
4125 
// Dispatches a single workgroup sized to the device's maxComputeWorkGroupSize
// limit on the chosen axis (injected via specialization constants), then
// verifies on the host that every invocation wrote 1u to its SSBO slot.
tcu::TestStatus MaxWorkGroupSizeInstance::iterate(void)
{
    const auto &vki           = m_context.getInstanceInterface();
    const auto &vkd           = m_context.getDeviceInterface();
    const auto physicalDevice = m_context.getPhysicalDevice();
    const auto device         = m_context.getDevice();
    auto &alloc               = m_context.getDefaultAllocator();
    const auto queueIndex     = m_context.getUniversalQueueFamilyIndex();
    const auto queue          = m_context.getUniversalQueue();
    auto &log                 = m_context.getTestContext().getLog();

    // One uint32_t SSBO slot per invocation along the maximized axis.
    const auto axis        = MaxWorkGroupSizeTest::getIndex(m_params.axis);
    const auto invocations = MaxWorkGroupSizeTest::getInvocations(m_params, vki, physicalDevice);
    const auto ssboSize    = static_cast<vk::VkDeviceSize>(MaxWorkGroupSizeTest::getSSBOSize(invocations));

    log << tcu::TestLog::Message << "Running test with " << invocations << " invocations on axis " << axis
        << " using a storage buffer size of " << ssboSize << " bytes" << tcu::TestLog::EndMessage;

    // Main SSBO buffer.
    const auto ssboInfo = vk::makeBufferCreateInfo(ssboSize, vk::VK_BUFFER_USAGE_STORAGE_BUFFER_BIT);
    vk::BufferWithMemory ssbo(vkd, device, alloc, ssboInfo, vk::MemoryRequirement::HostVisible);

    // Descriptor set layouts.
    vk::DescriptorSetLayoutBuilder layoutBuilder;
    layoutBuilder.addSingleBinding(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, vk::VK_SHADER_STAGE_COMPUTE_BIT);
    const auto descriptorSetLayout = layoutBuilder.build(vkd, device);

    // Specialization constants: set the number of invocations in the appropriate local size id.
    const auto entrySize          = static_cast<uintptr_t>(sizeof(int32_t));
    int32_t specializationData[3] = {1, 1, 1};
    specializationData[axis]      = static_cast<int32_t>(invocations);

    // One map entry per local_size_*_id spec constant (ids 0, 1, 2 in the shader).
    const vk::VkSpecializationMapEntry specializationMaps[3] = {
        {
            0u,        // uint32_t constantID;
            0u,        // uint32_t offset;
            entrySize, // uintptr_t size;
        },
        {
            1u,                               // uint32_t constantID;
            static_cast<uint32_t>(entrySize), // uint32_t offset;
            entrySize,                        // uintptr_t size;
        },
        {
            2u,                                    // uint32_t constantID;
            static_cast<uint32_t>(entrySize * 2u), // uint32_t offset;
            entrySize,                             // uintptr_t size;
        },
    };

    const vk::VkSpecializationInfo specializationInfo = {
        3u,                                                 // uint32_t mapEntryCount;
        specializationMaps,                                 // const VkSpecializationMapEntry* pMapEntries;
        static_cast<uintptr_t>(sizeof(specializationData)), // uintptr_t dataSize;
        specializationData,                                 // const void* pData;
    };

    ComputePipelineWrapper testPipeline(vkd, device, m_computePipelineConstructionType,
                                        m_context.getBinaryCollection().get("comp"));
    testPipeline.setDescriptorSetLayout(descriptorSetLayout.get());
    testPipeline.setSpecializationInfo(specializationInfo);
    testPipeline.buildPipeline();

    // Create descriptor pool and set.
    vk::DescriptorPoolBuilder poolBuilder;
    poolBuilder.addType(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
    const auto descriptorPool =
        poolBuilder.build(vkd, device, vk::VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u);
    const auto descriptorSet = vk::makeDescriptorSet(vkd, device, descriptorPool.get(), descriptorSetLayout.get());

    // Update descriptor set.
    const vk::VkDescriptorBufferInfo ssboBufferInfo = {
        ssbo.get(),    // VkBuffer buffer;
        0u,            // VkDeviceSize offset;
        VK_WHOLE_SIZE, // VkDeviceSize range;
    };

    vk::DescriptorSetUpdateBuilder updateBuilder;
    updateBuilder.writeSingle(descriptorSet.get(), vk::DescriptorSetUpdateBuilder::Location::binding(0u),
                              vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &ssboBufferInfo);
    updateBuilder.update(vkd, device);

    // Clear buffer so verification detects unwritten slots.
    auto &ssboAlloc = ssbo.getAllocation();
    void *ssboPtr   = ssboAlloc.getHostPtr();
    deMemset(ssboPtr, 0, static_cast<size_t>(ssboSize));
    vk::flushAlloc(vkd, device, ssboAlloc);

    // Run pipelines.
    const auto cmdPool = vk::makeCommandPool(vkd, device, queueIndex);
    const auto cmdBUfferPtr =
        vk::allocateCommandBuffer(vkd, device, cmdPool.get(), vk::VK_COMMAND_BUFFER_LEVEL_PRIMARY);
    const auto cmdBuffer = cmdBUfferPtr.get();

    vk::beginCommandBuffer(vkd, cmdBuffer);

    // Run the main test shader; order the host clear before the shader writes.
    const auto hostToComputeBarrier = vk::makeBufferMemoryBarrier(
        vk::VK_ACCESS_HOST_WRITE_BIT, vk::VK_ACCESS_SHADER_WRITE_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
    vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_HOST_BIT, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0u, 0u,
                           nullptr, 1u, &hostToComputeBarrier, 0u, nullptr);

    testPipeline.bind(cmdBuffer);
    vkd.cmdBindDescriptorSets(cmdBuffer, vk::VK_PIPELINE_BIND_POINT_COMPUTE, testPipeline.getPipelineLayout(), 0u, 1u,
                              &descriptorSet.get(), 0u, nullptr);
    // A single workgroup; the local size carries the actual invocation count.
    vkd.cmdDispatch(cmdBuffer, 1u, 1u, 1u);

    // Make shader writes visible to the host readback below.
    const auto computeToHostBarrier = vk::makeBufferMemoryBarrier(
        vk::VK_ACCESS_SHADER_WRITE_BIT, vk::VK_ACCESS_HOST_READ_BIT, ssbo.get(), 0ull, VK_WHOLE_SIZE);
    vkd.cmdPipelineBarrier(cmdBuffer, vk::VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, vk::VK_PIPELINE_STAGE_HOST_BIT, 0u, 0u,
                           nullptr, 1u, &computeToHostBarrier, 0u, nullptr);

    vk::endCommandBuffer(vkd, cmdBuffer);
    vk::submitCommandsAndWait(vkd, device, queue, cmdBuffer);

    // Verify buffer contents: every slot must contain exactly 1u.
    vk::invalidateAlloc(vkd, device, ssboAlloc);
    std::unique_ptr<uint32_t[]> valuesArray(new uint32_t[invocations]);
    uint32_t *valuesPtr = valuesArray.get();
    deMemcpy(valuesPtr, ssboPtr, static_cast<size_t>(ssboSize));

    std::string errorMsg;
    bool ok = true;

    for (size_t i = 0; i < invocations; ++i)
    {
        if (valuesPtr[i] != 1u)
        {
            ok       = false;
            errorMsg = "Found invalid value for invocation index " + de::toString(i) + ": expected 1u and found " +
                       de::toString(valuesPtr[i]);
            break;
        }
    }

    if (!ok)
        return tcu::TestStatus::fail(errorMsg);
    return tcu::TestStatus::pass("Pass");
}
4265 
4266 namespace EmptyShaderTest
4267 {
4268 
checkSupport(Context & context,vk::ComputePipelineConstructionType computePipelineConstructionType)4269 void checkSupport(Context &context, vk::ComputePipelineConstructionType computePipelineConstructionType)
4270 {
4271     checkShaderObjectRequirements(context.getInstanceInterface(), context.getPhysicalDevice(),
4272                                   computePipelineConstructionType);
4273 }
4274 
createProgram(SourceCollections & dst,vk::ComputePipelineConstructionType)4275 void createProgram(SourceCollections &dst, vk::ComputePipelineConstructionType)
4276 {
4277     dst.glslSources.add("comp") << glu::ComputeSource("#version 310 es\n"
4278                                                       "layout (local_size_x = 1) in;\n"
4279                                                       "void main (void) {}\n");
4280 }
4281 
createTest(Context & context,vk::ComputePipelineConstructionType computePipelineConstructionType)4282 tcu::TestStatus createTest(Context &context, vk::ComputePipelineConstructionType computePipelineConstructionType)
4283 {
4284     const DeviceInterface &vk       = context.getDeviceInterface();
4285     const VkDevice device           = context.getDevice();
4286     const VkQueue queue             = context.getUniversalQueue();
4287     const uint32_t queueFamilyIndex = context.getUniversalQueueFamilyIndex();
4288 
4289     ComputePipelineWrapper pipeline(vk, device, computePipelineConstructionType,
4290                                     context.getBinaryCollection().get("comp"));
4291     pipeline.buildPipeline();
4292 
4293     const Unique<VkCommandPool> cmdPool(makeCommandPool(vk, device, queueFamilyIndex));
4294     const Unique<VkCommandBuffer> cmdBuffer(
4295         allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
4296 
4297     // Start recording commands
4298 
4299     beginCommandBuffer(vk, *cmdBuffer);
4300 
4301     pipeline.bind(*cmdBuffer);
4302 
4303     const tcu::IVec3 workGroups(1, 1, 1);
4304     vk.cmdDispatch(*cmdBuffer, workGroups.x(), workGroups.y(), workGroups.z());
4305 
4306     endCommandBuffer(vk, *cmdBuffer);
4307 
4308     submitCommandsAndWait(vk, device, queue, *cmdBuffer);
4309 
4310     return tcu::TestStatus::pass("Compute succeeded");
4311 }
4312 
4313 } // namespace EmptyShaderTest
4314 
4315 namespace ComputeOnlyQueueTests
4316 {
4317 
getComputeOnlyQueueFamily(Context & context)4318 tcu::Maybe<uint32_t> getComputeOnlyQueueFamily(Context &context)
4319 {
4320     bool foundQueue = false;
4321     uint32_t index  = 0;
4322 
4323     auto queueFamilies =
4324         getPhysicalDeviceQueueFamilyProperties(context.getInstanceInterface(), context.getPhysicalDevice());
4325 
4326     for (const auto &queueFamily : queueFamilies)
4327     {
4328         if ((queueFamily.queueFlags & VK_QUEUE_COMPUTE_BIT) && !(queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT))
4329         {
4330             foundQueue = true;
4331             break;
4332         }
4333         else
4334         {
4335             index++;
4336         }
4337     }
4338     if (!foundQueue)
4339     {
4340         return tcu::Maybe<uint32_t>();
4341     }
4342     else
4343     {
4344         return index;
4345     }
4346 }
4347 
4348 // Creates a device that has a queue for compute capabilities without graphics.
createComputeOnlyDevice(vk::VkInstance instance,const InstanceInterface & instanceDriver,const VkPhysicalDevice physicalDevice,Context & context,uint32_t & queueFamilyIndex)4349 Move<VkDevice> createComputeOnlyDevice(vk::VkInstance instance, const InstanceInterface &instanceDriver,
4350                                        const VkPhysicalDevice physicalDevice, Context &context,
4351                                        uint32_t &queueFamilyIndex)
4352 {
4353     const auto queueFamilies = getPhysicalDeviceQueueFamilyProperties(instanceDriver, physicalDevice);
4354 
4355     // One queue family without a graphics bit should be found, since this is checked in checkSupport.
4356     queueFamilyIndex = getComputeOnlyQueueFamily(context).get();
4357 
4358     const float queuePriority                            = 1.0f;
4359     const VkDeviceQueueCreateInfo deviceQueueCreateInfos = {
4360         VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, // VkStructureType sType;
4361         nullptr,                                    // const void* pNext;
4362         (VkDeviceQueueCreateFlags)0u,               // VkDeviceQueueCreateFlags flags;
4363         queueFamilyIndex,                           // uint32_t queueFamilyIndex;
4364         1u,                                         // uint32_t queueCount;
4365         &queuePriority,                             // const float* pQueuePriorities;
4366     };
4367 
4368     void *pNext = nullptr;
4369 #ifdef CTS_USES_VULKANSC
4370     VkDeviceObjectReservationCreateInfo memReservationInfo = context.getTestContext().getCommandLine().isSubProcess() ?
4371                                                                  context.getResourceInterface()->getStatMax() :
4372                                                                  resetDeviceObjectReservationCreateInfo();
4373     pNext                                                  = &memReservationInfo;
4374 
4375     VkPhysicalDeviceVulkanSC10Features sc10Features = createDefaultSC10Features();
4376     sc10Features.pNext                              = pNext;
4377     pNext                                           = &sc10Features;
4378 
4379     VkPipelineCacheCreateInfo pcCI;
4380     std::vector<VkPipelinePoolSize> poolSizes;
4381     if (context.getTestContext().getCommandLine().isSubProcess())
4382     {
4383         if (context.getResourceInterface()->getCacheDataSize() > 0)
4384         {
4385             pcCI = {
4386                 VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, // VkStructureType sType;
4387                 nullptr,                                      // const void* pNext;
4388                 VK_PIPELINE_CACHE_CREATE_READ_ONLY_BIT |
4389                     VK_PIPELINE_CACHE_CREATE_USE_APPLICATION_STORAGE_BIT, // VkPipelineCacheCreateFlags flags;
4390                 context.getResourceInterface()->getCacheDataSize(),       // uintptr_t initialDataSize;
4391                 context.getResourceInterface()->getCacheData()            // const void* pInitialData;
4392             };
4393             memReservationInfo.pipelineCacheCreateInfoCount = 1;
4394             memReservationInfo.pPipelineCacheCreateInfos    = &pcCI;
4395         }
4396         poolSizes = context.getResourceInterface()->getPipelinePoolSizes();
4397         if (!poolSizes.empty())
4398         {
4399             memReservationInfo.pipelinePoolSizeCount = uint32_t(poolSizes.size());
4400             memReservationInfo.pPipelinePoolSizes    = poolSizes.data();
4401         }
4402     }
4403 #endif // CTS_USES_VULKANSC
4404     const VkDeviceCreateInfo deviceCreateInfo = {
4405         VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, // VkStructureType sType;
4406         pNext,                                // const void* pNext;
4407         (VkDeviceCreateFlags)0u,              // VkDeviceCreateFlags flags;
4408         1,                                    // uint32_t queueCreateInfoCount;
4409         &deviceQueueCreateInfos,              // const VkDeviceQueueCreateInfo* pQueueCreateInfos;
4410         0u,                                   // uint32_t enabledLayerCount;
4411         nullptr,                              // const char* const* ppEnabledLayerNames;
4412         0,                                    // uint32_t enabledExtensionCount;
4413         nullptr,                              // const char* const* ppEnabledExtensionNames;
4414         nullptr,                              // const VkPhysicalDeviceFeatures* pEnabledFeatures;
4415     };
4416 
4417     return vkt::createCustomDevice(context.getTestContext().getCommandLine().isValidationEnabled(),
4418                                    context.getPlatformInterface(), instance, instanceDriver, physicalDevice,
4419                                    &deviceCreateInfo);
4420 }
4421 
// Test case verifying that work recorded in a secondary command buffer can be
// executed on a queue family that supports compute but not graphics.
class SecondaryCommandBufferComputeOnlyTest : public vkt::TestCase
{
public:
    SecondaryCommandBufferComputeOnlyTest(tcu::TestContext &context, const std::string &name)
        : vkt::TestCase(context, name){};

    void initPrograms(SourceCollections &programCollection) const override;
    TestInstance *createInstance(Context &context) const override;
    void checkSupport(Context &context) const override;
};
4432 
// Instance side of SecondaryCommandBufferComputeOnlyTest: creates its own
// compute-only device in iterate() and runs the dispatch there.
class SecondaryCommandBufferComputeOnlyTestInstance : public vkt::TestInstance
{
public:
    SecondaryCommandBufferComputeOnlyTestInstance(Context &context)
        : vkt::TestInstance(context)
#ifdef CTS_USES_VULKANSC
        , m_customInstance(createCustomInstanceFromContext(context))
#endif // CTS_USES_VULKANSC
              {};
    virtual tcu::TestStatus iterate(void);

protected:
#ifdef CTS_USES_VULKANSC
    // Vulkan SC builds create the custom device from this dedicated instance.
    const CustomInstance m_customInstance;
#endif // CTS_USES_VULKANSC
};
4449 
initPrograms(SourceCollections & collection) const4450 void SecondaryCommandBufferComputeOnlyTest::initPrograms(SourceCollections &collection) const
4451 {
4452     {
4453         std::ostringstream src;
4454         src << glu::getGLSLVersionDeclaration(glu::GLSL_VERSION_450) << "\n"
4455             << "layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;\n"
4456             << "layout(set = 0, binding = 0, std430) buffer Out\n"
4457             << "{\n"
4458             << "    uint data[];\n"
4459             << "};\n"
4460             << "void main (void)\n"
4461             << "{\n"
4462             << "data[0] = 1;"
4463             << "}\n";
4464         collection.glslSources.add("comp") << glu::ComputeSource(src.str());
4465     }
4466 }
4467 
createInstance(Context & context) const4468 TestInstance *SecondaryCommandBufferComputeOnlyTest::createInstance(Context &context) const
4469 {
4470     return new SecondaryCommandBufferComputeOnlyTestInstance(context);
4471 }
4472 
checkSupport(Context & context) const4473 void SecondaryCommandBufferComputeOnlyTest::checkSupport(Context &context) const
4474 {
4475     // Find at least one queue family that supports compute queue but does NOT support graphics queue.
4476     if (!getComputeOnlyQueueFamily(context))
4477         TCU_THROW(NotSupportedError, "No queue family found that only supports compute queue.");
4478 }
4479 
iterate()4480 tcu::TestStatus SecondaryCommandBufferComputeOnlyTestInstance::iterate()
4481 {
4482     VkDevice device;
4483     uint32_t queueFamilyIndex;
4484 #ifdef CTS_USES_VULKANSC
4485     const vk::InstanceInterface &vki = m_customInstance.getDriver();
4486     const VkPhysicalDevice physDevice =
4487         chooseDevice(vki, m_customInstance, m_context.getTestContext().getCommandLine());
4488     auto customDevice = createComputeOnlyDevice(m_customInstance, vki, physDevice, m_context, queueFamilyIndex);
4489     de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter> deviceDriver;
4490 #else
4491     const InstanceInterface &vki      = m_context.getInstanceInterface();
4492     const VkPhysicalDevice physDevice = m_context.getPhysicalDevice();
4493     auto customDevice = createComputeOnlyDevice(m_context.getInstance(), vki, physDevice, m_context, queueFamilyIndex);
4494     de::MovePtr<DeviceDriver> deviceDriver;
4495 #endif // CTS_USES_VULKANSC
4496 
4497     device = customDevice.get();
4498 
4499 #ifndef CTS_USES_VULKANSC
4500     deviceDriver = de::MovePtr<DeviceDriver>(new DeviceDriver(m_context.getPlatformInterface(), m_context.getInstance(),
4501                                                               device, m_context.getUsedApiVersion(),
4502                                                               m_context.getTestContext().getCommandLine()));
4503 #else
4504     deviceDriver = de::MovePtr<DeviceDriverSC, DeinitDeviceDeleter>(
4505         new DeviceDriverSC(m_context.getPlatformInterface(), m_customInstance, device,
4506                            m_context.getTestContext().getCommandLine(), m_context.getResourceInterface(),
4507                            m_context.getDeviceVulkanSC10Properties(), m_context.getDeviceProperties(),
4508                            m_context.getUsedApiVersion()),
4509         DeinitDeviceDeleter(m_context.getResourceInterface().get(), device));
4510 #endif // CTS_USES_VULKANSC
4511 
4512     const DeviceInterface &vkdi = *deviceDriver;
4513 
4514     auto queue = getDeviceQueue(vkdi, device, queueFamilyIndex, 0u);
4515     auto allocator =
4516         de::MovePtr<Allocator>(new SimpleAllocator(vkdi, device, getPhysicalDeviceMemoryProperties(vki, physDevice)));
4517 
4518     const auto bufferSize = static_cast<VkDeviceSize>(sizeof(uint32_t));
4519     BufferWithMemory buffer(vkdi, device, *allocator.get(),
4520                             makeBufferCreateInfo(bufferSize, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT),
4521                             MemoryRequirement::HostVisible);
4522     auto &bufferAlloc = buffer.getAllocation();
4523     void *bufferData  = bufferAlloc.getHostPtr();
4524     deMemset(bufferData, 0, sizeof(uint32_t));
4525     flushAlloc(vkdi, device, bufferAlloc);
4526 
4527     DescriptorSetLayoutBuilder layoutBuilder;
4528     layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_COMPUTE_BIT);
4529     Unique<VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vkdi, device));
4530 
4531     DescriptorPoolBuilder poolBuilder;
4532     poolBuilder.addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
4533     const auto descriptorPool = poolBuilder.build(vkdi, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1);
4534     const auto descriptorSetBuffer = makeDescriptorSet(vkdi, device, descriptorPool.get(), descriptorSetLayout.get());
4535 
4536     // Update descriptor sets.
4537     DescriptorSetUpdateBuilder updater;
4538 
4539     const auto bufferInfo = makeDescriptorBufferInfo(buffer.get(), 0ull, bufferSize);
4540     updater.writeSingle(descriptorSetBuffer.get(), DescriptorSetUpdateBuilder::Location::binding(0u),
4541                         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferInfo);
4542 
4543     updater.update(vkdi, device);
4544 
4545     auto shader = createShaderModule(vkdi, device, m_context.getBinaryCollection().get("comp"));
4546     // Create compute pipeline
4547     const Unique<VkPipelineLayout> pipelineLayout(makePipelineLayout(vkdi, device, *descriptorSetLayout));
4548     const Unique<VkPipeline> computePipeline(makeComputePipeline(vkdi, device, *pipelineLayout, *shader));
4549 
4550     // Create command buffer
4551     const Unique<VkCommandPool> cmdPool(makeCommandPool(vkdi, device, queueFamilyIndex));
4552     const Unique<VkCommandBuffer> cmdBuffer(
4553         allocateCommandBuffer(vkdi, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY));
4554     const Unique<VkCommandBuffer> cmdBuffer2(
4555         allocateCommandBuffer(vkdi, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_SECONDARY));
4556 
4557     const VkCommandBufferInheritanceInfo bufferInheritanceInfo{
4558         VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO, // VkStructureType sType;
4559         nullptr,                                           // const void* pNext;
4560         VK_NULL_HANDLE,                                    // VkRenderPass renderPass;
4561         0u,                                                // uint32_t subpass;
4562         VK_NULL_HANDLE,                                    // VkFramebuffer framebuffer;
4563         VK_FALSE,                                          // VkBool32 occlusionQueryEnable;
4564         (VkQueryControlFlags)0u,                           // VkQueryControlFlags queryFlags;
4565         (VkQueryPipelineStatisticFlags)0u                  // VkQueryPipelineStatisticFlags pipelineStatistics;
4566     };
4567 
4568     VkCommandBufferUsageFlags usageFlags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
4569     const VkCommandBufferBeginInfo commandBufBeginParams{
4570         VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, // VkStructureType sType;
4571         nullptr,                                     // const void* pNext;
4572         usageFlags,                                  // VkCommandBufferUsageFlags flags;
4573         &bufferInheritanceInfo};
4574 
4575     beginCommandBuffer(vkdi, cmdBuffer.get());
4576     vkdi.beginCommandBuffer(cmdBuffer2.get(), &commandBufBeginParams);
4577     vkdi.cmdBindPipeline(cmdBuffer2.get(), VK_PIPELINE_BIND_POINT_COMPUTE, computePipeline.get());
4578     vkdi.cmdBindDescriptorSets(cmdBuffer2.get(), VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout.get(), 0u, 1,
4579                                &descriptorSetBuffer.get(), 0u, nullptr);
4580     vkdi.cmdDispatch(cmdBuffer2.get(), 1, 1, 1);
4581     endCommandBuffer(vkdi, cmdBuffer2.get());
4582     vkdi.cmdExecuteCommands(cmdBuffer.get(), 1, &cmdBuffer2.get());
4583     const VkBufferMemoryBarrier renderBufferBarrier =
4584         makeBufferMemoryBarrier(VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_HOST_READ_BIT, buffer.get(), 0ull, bufferSize);
4585     cmdPipelineBufferMemoryBarrier(vkdi, cmdBuffer.get(), VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
4586                                    VK_PIPELINE_STAGE_HOST_BIT, &renderBufferBarrier);
4587     endCommandBuffer(vkdi, cmdBuffer.get());
4588     submitCommandsAndWait(vkdi, device, queue, cmdBuffer.get());
4589 
4590     invalidateAlloc(vkdi, device, bufferAlloc);
4591 
4592     uint32_t result = 0;
4593     deMemcpy(&result, bufferData, sizeof(uint32_t));
4594     if (result != 1)
4595     {
4596         return tcu::TestStatus::pass("value of buffer unexpected");
4597     }
4598 
4599     return tcu::TestStatus::pass("passed");
4600 }
4601 
4602 }; // namespace ComputeOnlyQueueTests
4603 
4604 } // namespace
4605 
// Creates the "basic" compute test group covering simple dispatches, SSBO/UBO
// data movement, barriers, shared variables, atomics and image access.
tcu::TestCaseGroup *createBasicComputeShaderTests(tcu::TestContext &testCtx,
                                                  vk::ComputePipelineConstructionType computePipelineConstructionType)
{
    // Basic compute tests
    de::MovePtr<tcu::TestCaseGroup> basicComputeTests(new tcu::TestCaseGroup(testCtx, "basic"));

    // Shader that does nothing
    addFunctionCaseWithPrograms(basicComputeTests.get(), "empty_shader", EmptyShaderTest::checkSupport,
                                EmptyShaderTest::createProgram, EmptyShaderTest::createTest,
                                computePipelineConstructionType);

    // Concurrent compute test
    basicComputeTests->addChild(new ConcurrentCompute(testCtx, "concurrent_compute", computePipelineConstructionType));

    // Use an empty workgroup with size 0 on the X axis
    basicComputeTests->addChild(
        new EmptyWorkGroupCase(testCtx, "empty_workgroup_x", tcu::UVec3(0u, 2u, 3u), computePipelineConstructionType));
    // Use an empty workgroup with size 0 on the Y axis
    basicComputeTests->addChild(
        new EmptyWorkGroupCase(testCtx, "empty_workgroup_y", tcu::UVec3(2u, 0u, 3u), computePipelineConstructionType));
    // Use an empty workgroup with size 0 on the Z axis
    basicComputeTests->addChild(
        new EmptyWorkGroupCase(testCtx, "empty_workgroup_z", tcu::UVec3(2u, 3u, 0u), computePipelineConstructionType));
    // Use an empty workgroup with size 0 on the X, Y and Z axes
    basicComputeTests->addChild(new EmptyWorkGroupCase(testCtx, "empty_workgroup_all", tcu::UVec3(0u, 0u, 0u),
                                                       computePipelineConstructionType));

    // Use the maximum work group size on the X axis
    basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_x",
                                                         MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::X},
                                                         computePipelineConstructionType));
    // Use the maximum work group size on the Y axis
    basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_y",
                                                         MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Y},
                                                         computePipelineConstructionType));
    // Use the maximum work group size on the Z axis
    basicComputeTests->addChild(new MaxWorkGroupSizeTest(testCtx, "max_local_size_z",
                                                         MaxWorkGroupSizeTest::Params{MaxWorkGroupSizeTest::Axis::Z},
                                                         computePipelineConstructionType));

    // Invert data while copying from a UBO to an SSBO, with varying local/global sizes
    basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(
        testCtx, "ubo_to_ssbo_single_invocation", 256, tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1),
        computePipelineConstructionType));
    basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx, "ubo_to_ssbo_single_group", 1024,
                                                                              tcu::IVec3(2, 1, 4), tcu::IVec3(1, 1, 1),
                                                                              computePipelineConstructionType));
    basicComputeTests->addChild(BufferToBufferInvertTest::UBOToSSBOInvertCase(
        testCtx, "ubo_to_ssbo_multiple_invocations", 1024, tcu::IVec3(1, 1, 1), tcu::IVec3(2, 4, 1),
        computePipelineConstructionType));
    basicComputeTests->addChild(
        BufferToBufferInvertTest::UBOToSSBOInvertCase(testCtx, "ubo_to_ssbo_multiple_groups", 1024, tcu::IVec3(1, 4, 2),
                                                      tcu::IVec3(2, 2, 4), computePipelineConstructionType));

    // Invert data while copying between SSBOs
    basicComputeTests->addChild(
        BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx, "copy_ssbo_single_invocation", 256, tcu::IVec3(1, 1, 1),
                                                     tcu::IVec3(1, 1, 1), computePipelineConstructionType));
    basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(
        testCtx, "copy_ssbo_multiple_invocations", 1024, tcu::IVec3(1, 1, 1), tcu::IVec3(2, 4, 1),
        computePipelineConstructionType));
    basicComputeTests->addChild(BufferToBufferInvertTest::CopyInvertSSBOCase(testCtx, "copy_ssbo_multiple_groups", 1024,
                                                                             tcu::IVec3(1, 4, 2), tcu::IVec3(2, 2, 4),
                                                                             computePipelineConstructionType));

    // Read and write same SSBO
    basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx, "ssbo_rw_single_invocation", 256, true,
                                                          tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1),
                                                          computePipelineConstructionType));
    basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx, "ssbo_rw_multiple_groups", 1024, true,
                                                          tcu::IVec3(1, 4, 2), tcu::IVec3(2, 2, 4),
                                                          computePipelineConstructionType));
    basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx, "ssbo_unsized_arr_single_invocation", 256, false,
                                                          tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1),
                                                          computePipelineConstructionType));
    basicComputeTests->addChild(new InvertSSBOInPlaceTest(testCtx, "ssbo_unsized_arr_multiple_groups", 1024, false,
                                                          tcu::IVec3(1, 4, 2), tcu::IVec3(2, 2, 4),
                                                          computePipelineConstructionType));

    // Write to multiple SSBOs
    basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx, "write_multiple_arr_single_invocation", 256, true,
                                                            tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1),
                                                            computePipelineConstructionType));
    basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx, "write_multiple_arr_multiple_groups", 1024, true,
                                                            tcu::IVec3(1, 4, 2), tcu::IVec3(2, 2, 4),
                                                            computePipelineConstructionType));
    basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx, "write_multiple_unsized_arr_single_invocation",
                                                            256, false, tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1),
                                                            computePipelineConstructionType));
    basicComputeTests->addChild(new WriteToMultipleSSBOTest(testCtx, "write_multiple_unsized_arr_multiple_groups", 1024,
                                                            false, tcu::IVec3(1, 4, 2), tcu::IVec3(2, 2, 4),
                                                            computePipelineConstructionType));

    // SSBO local barrier usage
    basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx, "ssbo_local_barrier_single_invocation",
                                                         tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1),
                                                         computePipelineConstructionType));
    basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx, "ssbo_local_barrier_single_group",
                                                         tcu::IVec3(3, 2, 5), tcu::IVec3(1, 1, 1),
                                                         computePipelineConstructionType));
    basicComputeTests->addChild(new SSBOLocalBarrierTest(testCtx, "ssbo_local_barrier_multiple_groups",
                                                         tcu::IVec3(3, 4, 1), tcu::IVec3(2, 7, 3),
                                                         computePipelineConstructionType));

    // SSBO memory barrier usage
    basicComputeTests->addChild(
        new SSBOBarrierTest(testCtx, "ssbo_cmd_barrier_single", tcu::IVec3(1, 1, 1), computePipelineConstructionType));
    basicComputeTests->addChild(new SSBOBarrierTest(testCtx, "ssbo_cmd_barrier_multiple", tcu::IVec3(11, 5, 7),
                                                    computePipelineConstructionType));

    // Basic shared variable usage
    basicComputeTests->addChild(new SharedVarTest(testCtx, "shared_var_single_invocation", tcu::IVec3(1, 1, 1),
                                                  tcu::IVec3(1, 1, 1), computePipelineConstructionType));
    basicComputeTests->addChild(new SharedVarTest(testCtx, "shared_var_single_group", tcu::IVec3(3, 2, 5),
                                                  tcu::IVec3(1, 1, 1), computePipelineConstructionType));
    basicComputeTests->addChild(new SharedVarTest(testCtx, "shared_var_multiple_invocations", tcu::IVec3(1, 1, 1),
                                                  tcu::IVec3(2, 5, 4), computePipelineConstructionType));
    basicComputeTests->addChild(new SharedVarTest(testCtx, "shared_var_multiple_groups", tcu::IVec3(3, 4, 1),
                                                  tcu::IVec3(2, 7, 3), computePipelineConstructionType));

    // Atomic operation with shared var
    basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx, "shared_atomic_op_single_invocation",
                                                          tcu::IVec3(1, 1, 1), tcu::IVec3(1, 1, 1),
                                                          computePipelineConstructionType));
    basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx, "shared_atomic_op_single_group", tcu::IVec3(3, 2, 5),
                                                          tcu::IVec3(1, 1, 1), computePipelineConstructionType));
    basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx, "shared_atomic_op_multiple_invocations",
                                                          tcu::IVec3(1, 1, 1), tcu::IVec3(2, 5, 4),
                                                          computePipelineConstructionType));
    basicComputeTests->addChild(new SharedVarAtomicOpTest(testCtx, "shared_atomic_op_multiple_groups",
                                                          tcu::IVec3(3, 4, 1), tcu::IVec3(2, 7, 3),
                                                          computePipelineConstructionType));

    // Image to SSBO copy
    basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx, "copy_image_to_ssbo_small", tcu::IVec2(1, 1),
                                                        tcu::IVec2(64, 64), computePipelineConstructionType));
    basicComputeTests->addChild(new CopyImageToSSBOTest(testCtx, "copy_image_to_ssbo_large", tcu::IVec2(2, 4),
                                                        tcu::IVec2(512, 512), computePipelineConstructionType));

    // SSBO to image copy
    basicComputeTests->addChild(new CopySSBOToImageTest(testCtx, "copy_ssbo_to_image_small", tcu::IVec2(1, 1),
                                                        tcu::IVec2(64, 64), computePipelineConstructionType));
    basicComputeTests->addChild(new CopySSBOToImageTest(testCtx, "copy_ssbo_to_image_large", tcu::IVec2(2, 4),
                                                        tcu::IVec2(512, 512), computePipelineConstructionType));

    // Atomic operation with image
    basicComputeTests->addChild(new ImageAtomicOpTest(testCtx, "image_atomic_op_local_size_1", 1, tcu::IVec2(64, 64),
                                                      computePipelineConstructionType));
    basicComputeTests->addChild(new ImageAtomicOpTest(testCtx, "image_atomic_op_local_size_8", 8, tcu::IVec2(64, 64),
                                                      computePipelineConstructionType));

    // Image barrier
    basicComputeTests->addChild(
        new ImageBarrierTest(testCtx, "image_barrier_single", tcu::IVec2(1, 1), computePipelineConstructionType));
    basicComputeTests->addChild(
        new ImageBarrierTest(testCtx, "image_barrier_multiple", tcu::IVec2(64, 64), computePipelineConstructionType));

    // Test secondary command buffers in compute only queues
    basicComputeTests->addChild(
        new ComputeOnlyQueueTests::SecondaryCommandBufferComputeOnlyTest(testCtx, "secondary_compute_only_queue"));

#ifndef CTS_USES_VULKANSC
    // Amber cases use precompiled scripts and do not go through the pipeline wrapper,
    // so they are skipped for shader-object construction types.
    if (!isComputePipelineConstructionTypeShaderObject(computePipelineConstructionType))
    {
        basicComputeTests->addChild(
            cts_amber::createAmberTestCase(testCtx, "write_ssbo_array", "", "compute", "write_ssbo_array.amber"));
        basicComputeTests->addChild(
            cts_amber::createAmberTestCase(testCtx, "branch_past_barrier", "", "compute", "branch_past_barrier.amber"));
        basicComputeTests->addChild(cts_amber::createAmberTestCase(
            testCtx, "webgl_spirv_loop",
            "Simple SPIR-V loop from a WebGL example that caused problems in some implementations", "compute",
            "webgl_spirv_loop.amber"));
    }
#endif

    return basicComputeTests.release();
}
4783 
// Creates the "device_group" compute test group (dispatch-base and
// device-index cases exercising device-group style dispatches).
tcu::TestCaseGroup *createBasicDeviceGroupComputeShaderTests(
    tcu::TestContext &testCtx, vk::ComputePipelineConstructionType computePipelineConstructionType)
{
    de::MovePtr<tcu::TestCaseGroup> deviceGroupComputeTests(new tcu::TestCaseGroup(testCtx, "device_group"));

    deviceGroupComputeTests->addChild(new DispatchBaseTest(testCtx, "dispatch_base", 32768, tcu::IVec3(4, 2, 4),
                                                           tcu::IVec3(16, 8, 8), tcu::IVec3(4, 8, 8),
                                                           computePipelineConstructionType, false));
#ifndef CTS_USES_VULKANSC
    // Same case with the last flag set (maintenance5 variant); not built for Vulkan SC.
    deviceGroupComputeTests->addChild(new DispatchBaseTest(testCtx, "dispatch_base_maintenance5", 32768,
                                                           tcu::IVec3(4, 2, 4), tcu::IVec3(16, 8, 8),
                                                           tcu::IVec3(4, 8, 8), computePipelineConstructionType, true));
#endif
    deviceGroupComputeTests->addChild(new DeviceIndexTest(testCtx, "device_index", 96, tcu::IVec3(3, 2, 1),
                                                          tcu::IVec3(2, 4, 1), computePipelineConstructionType));

    return deviceGroupComputeTests.release();
}
4802 } // namespace compute
4803 } // namespace vkt
4804