xref: /aosp_15_r20/external/deqp/external/vulkancts/modules/vulkan/reconvergence/vktReconvergenceTests.cpp (revision 35238bce31c2a825756842865a792f8cf7f89930)
1 /*------------------------------------------------------------------------
2  * Vulkan Conformance Tests
3  * ------------------------
4  *
5  * Copyright (c) 2019 The Khronos Group Inc.
6  * Copyright (c) 2018-2020 NVIDIA Corporation
7  *
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  *      http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  *
20  * \file
21  * \brief Vulkan Reconvergence tests
22  *//*--------------------------------------------------------------------*/
23 
24 #include "vktReconvergenceTests.hpp"
25 
26 #include "vkBufferWithMemory.hpp"
27 #include "vkImageWithMemory.hpp"
28 #include "vkQueryUtil.hpp"
29 #include "vkBuilderUtil.hpp"
30 #include "vkCmdUtil.hpp"
31 #include "vkTypeUtil.hpp"
32 #include "vkObjUtil.hpp"
33 
34 #include "vktTestGroupUtil.hpp"
35 #include "vktTestCase.hpp"
36 #include "vktAmberTestCase.hpp"
37 
38 #include "deDefs.h"
39 #include "deFloat16.h"
40 #include "deMath.h"
41 #include "deRandom.h"
42 #include "deSharedPtr.hpp"
43 #include "deString.h"
44 
45 #include "tcuTestCase.hpp"
46 #include "tcuTestLog.hpp"
47 
48 #include <array>
49 #include <bitset>
50 #include <functional>
51 #include <map>
52 #include <numeric>
53 #include <random>
54 #include <string>
55 #include <sstream>
56 #include <set>
57 #include <type_traits>
58 #include <vector>
59 #include <memory>
60 #include <cmath>
61 #include <initializer_list>
62 
63 #include <iostream>
64 
65 // #define INCLUDE_GRAPHICS_TESTS
66 
67 namespace vkt
68 {
69 namespace Reconvergence
70 {
71 namespace
72 {
73 using namespace vk;
74 using namespace std;
75 
// Element count of a statically sized array (must not be used on pointers).
#define ARRAYSIZE(x) (sizeof(x) / sizeof(x[0]))
// Rounds x__ up to the closest multiple of multipler__ (integer arithmetic).
#define ROUNDUP(x__, multipler__) ((((x__) + ((multipler__)-1)) / (multipler__)) * (multipler__))
// Rounds x__ down to the closest multiple of multipler__ (integer arithmetic).
#define ROUNDDOWN(x__, multipler__) (((x__) / (multipler__)) * (multipler__))
// Largest sizeX * sizeY product accepted by CaseDef::verify().
constexpr uint32_t MAX_INVOCATIONS_ALL_TESTS = 64 * 64;
// One bit per invocation, wide enough for the largest supported test.
using bitset_inv_t = std::bitset<MAX_INVOCATIONS_ALL_TESTS>;
//constexpr bitset_inv_t MAGIC_BALLOT = 0x12345678;
82 
// Kind of reconvergence guarantee a test case exercises.
enum TestType
{
    TT_SUCF_ELECT = 0,  // subgroup_uniform_control_flow using elect (subgroup_basic)
    TT_SUCF_BALLOT = 1, // subgroup_uniform_control_flow using ballot (subgroup_ballot)
    TT_WUCF_ELECT = 2,  // workgroup uniform control flow using elect (subgroup_basic)
    TT_WUCF_BALLOT = 3, // workgroup uniform control flow using ballot (subgroup_ballot)
    TT_MAXIMAL = 4,     // maximal reconvergence
};
91 
92 static_assert(VK_TRUE == 1, "VK_TRUE must equal 1");
93 
94 struct CaseDef
95 {
96     VkShaderStageFlagBits shaderStage;
97     TestType testType;
98     uint32_t maxNesting;
99     uint32_t seed;
100     // In the case of compute shader below sizes would be local_size_x and local_size_y respectively.
101     // In the case of fragment shader these sizes would define framebuffer dimensions.
102     uint32_t sizeX;
103     uint32_t sizeY;
104 
isWUCFvkt::Reconvergence::__anon4f2394780111::CaseDef105     bool isWUCF() const
106     {
107         return testType == TT_WUCF_ELECT || testType == TT_WUCF_BALLOT;
108     }
isSUCFvkt::Reconvergence::__anon4f2394780111::CaseDef109     bool isSUCF() const
110     {
111         return testType == TT_SUCF_ELECT || testType == TT_SUCF_BALLOT;
112     }
isUCFvkt::Reconvergence::__anon4f2394780111::CaseDef113     bool isUCF() const
114     {
115         return isWUCF() || isSUCF();
116     }
isElectvkt::Reconvergence::__anon4f2394780111::CaseDef117     bool isElect() const
118     {
119         return testType == TT_WUCF_ELECT || testType == TT_SUCF_ELECT;
120     }
121 
verifyvkt::Reconvergence::__anon4f2394780111::CaseDef122     bool verify() const
123     {
124         return (sizeX * sizeY) <= MAX_INVOCATIONS_ALL_TESTS;
125     }
126 };
127 
// Treats the untyped memory at p as an array of T and returns the iterator
// pair {begin, begin + n} over it. P is the pointer-to-array type used for the
// reinterpretation and R the resulting iterator type.
template <class T, class P = T (*)[1], class R = decltype(std::begin(*std::declval<P>()))>
static auto makeStdBeginEnd(void *p, uint32_t n) -> std::pair<R, R>
{
    const auto first = std::begin(*static_cast<P>(p));
    auto last        = first;
    std::advance(last, n);
    return std::pair<R, R>(first, last);
}
136 
// R&
template <class R>
using add_ref = std::add_lvalue_reference_t<R>;
// const R&
template <class R>
using add_cref = std::add_lvalue_reference_t<std::add_const_t<R>>;
// X*
template <class X>
using add_ptr = typename std::add_pointer<X>::type;
// const X*
template <class X>
using add_cptr = typename std::add_pointer<typename std::add_const<X>::type>::type;
145 
// Returns an iterator to the first largest element of [first, last), compared
// with operator>, or last when the range is empty.
template <class RndIter>
RndIter max_element(RndIter first, RndIter last)
{
    if (first == last)
        return last;

    RndIter best = first;
    while (++first != last)
    {
        if (*first > *best)
            best = first;
    }
    return best;
}
160 
// Returns an iterator to the first element of [first, last) whose
// selector(element) is the largest (compared with operator>), or last when
// the range is empty.
template <class RndIter, class Selector>
RndIter max_element(RndIter first, RndIter last, Selector selector)
{
    if (first == last)
        return last;

    RndIter best = first;
    while (++first != last)
    {
        if (selector(*first) > selector(*best))
            best = first;
    }
    return best;
}
175 
176 struct Ballot : public std::bitset<128>
177 {
178     typedef std::bitset<128> super;
Ballotvkt::Reconvergence::__anon4f2394780111::Ballot179     Ballot() : super()
180     {
181     }
Ballotvkt::Reconvergence::__anon4f2394780111::Ballot182     Ballot(add_cref<super> ballot, uint32_t printbits = 128u) : super(ballot), m_bits(printbits)
183     {
184     }
Ballotvkt::Reconvergence::__anon4f2394780111::Ballot185     Ballot(add_cref<tcu::UVec4> ballot, uint32_t printbits = 128u) : super(), m_bits(printbits)
186     {
187         *this = ballot;
188     }
Ballotvkt::Reconvergence::__anon4f2394780111::Ballot189     Ballot(uint64_t val, uint32_t printbits = 128u) : super(val), m_bits(printbits)
190     {
191     }
withSetBitvkt::Reconvergence::__anon4f2394780111::Ballot192     static Ballot withSetBit(uint32_t bit)
193     {
194         Ballot b;
195         b.set(bit);
196         return b;
197     }
sizevkt::Reconvergence::__anon4f2394780111::Ballot198     constexpr uint32_t size() const
199     {
200         return static_cast<uint32_t>(super::size());
201     }
operator tcu::UVec4vkt::Reconvergence::__anon4f2394780111::Ballot202     operator tcu::UVec4() const
203     {
204         tcu::UVec4 result;
205         super ballot(*this);
206         const super mask = 0xFFFFFFFF;
207         for (uint32_t k = 0; k < 4u; ++k)
208         {
209             result[k] = uint32_t((ballot & mask).to_ulong());
210             ballot >>= 32;
211         }
212         return result;
213     }
operator =vkt::Reconvergence::__anon4f2394780111::Ballot214     add_ref<Ballot> operator=(add_cref<tcu::UVec4> vec)
215     {
216         for (uint32_t k = 0; k < 4u; ++k)
217         {
218             (*this) <<= 32;
219             (*this) |= vec[3 - k];
220         }
221         return *this;
222     }
getwvkt::Reconvergence::__anon4f2394780111::Ballot223     DE_UNUSED_FUNCTION uint32_t getw() const
224     {
225         return m_bits;
226     }
setwvkt::Reconvergence::__anon4f2394780111::Ballot227     DE_UNUSED_FUNCTION void setw(uint32_t bits)
228     {
229         m_bits = bits;
230     }
operator <<(add_ref<std::ostream> str,add_cref<Ballot> ballot)231     DE_UNUSED_FUNCTION friend add_ref<std::ostream> operator<<(add_ref<std::ostream> str, add_cref<Ballot> ballot)
232     {
233         for (uint32_t i = 0u; i < ballot.m_bits && i < 128u; ++i)
234         {
235             str << (ballot[ballot.m_bits - i - 1u] ? '1' : '0');
236         }
237         return str;
238     }
239 
240 protected:
241     uint32_t m_bits;
242 };
243 
244 struct Ballots : protected std::vector<std::bitset<128>>
245 {
246     typedef std::vector<value_type> super;
247     static const constexpr uint32_t subgroupInvocationSize = static_cast<uint32_t>(value_type().size());
Ballotsvkt::Reconvergence::__anon4f2394780111::Ballots248     Ballots() : super()
249     {
250     }
Ballotsvkt::Reconvergence::__anon4f2394780111::Ballots251     explicit Ballots(uint32_t subgroupCount, add_cref<value_type> ballot = {}) : super(subgroupCount)
252     {
253         if (ballot.any())
254             *this = ballot;
255     }
Ballotsvkt::Reconvergence::__anon4f2394780111::Ballots256     Ballots(add_cref<Ballots> other) : super(upcast(other))
257     {
258     }
Ballotsvkt::Reconvergence::__anon4f2394780111::Ballots259     Ballots(Ballots &&other) : super(std::move(other))
260     {
261     }
262     using super::operator[];
263     using super::at;
264     /**
265      * @brief size method
266      * @return Returns the number of bits that the Ballots holds.
267      */
sizevkt::Reconvergence::__anon4f2394780111::Ballots268     uint32_t size() const
269     {
270         return static_cast<uint32_t>(super::size() * subgroupInvocationSize);
271     }
272     /**
273      * @brief count method
274      * @return Returns the number of bits that are set to true.
275      */
countvkt::Reconvergence::__anon4f2394780111::Ballots276     uint32_t count() const
277     {
278         uint32_t n = 0u;
279         for (add_cref<value_type> b : *this)
280             n += static_cast<uint32_t>(b.count());
281         return n;
282     }
283     /**
284      * @brief count method
285      * @return Returns the number of bits that are set to true in given subgroup.
286      */
countvkt::Reconvergence::__anon4f2394780111::Ballots287     uint32_t count(uint32_t subgroup) const
288     {
289         DE_ASSERT(subgroup < subgroupCount());
290         return static_cast<uint32_t>(at(subgroup).count());
291     }
subgroupCountvkt::Reconvergence::__anon4f2394780111::Ballots292     uint32_t subgroupCount() const
293     {
294         return static_cast<uint32_t>(super::size());
295     }
testvkt::Reconvergence::__anon4f2394780111::Ballots296     bool test(uint32_t bit) const
297     {
298         DE_ASSERT(bit < size());
299         return at(bit / subgroupInvocationSize).test(bit % subgroupInvocationSize);
300     }
setvkt::Reconvergence::__anon4f2394780111::Ballots301     bool set(uint32_t bit, bool value = true)
302     {
303         DE_ASSERT(bit <= size());
304         const bool before = test(bit);
305         at(bit / subgroupInvocationSize).set((bit % subgroupInvocationSize), value);
306         return before;
307     }
fullvkt::Reconvergence::__anon4f2394780111::Ballots308     void full()
309     {
310         const uint32_t bb = size();
311         for (uint32_t b = 0u; b < bb; ++b)
312             set(b);
313     }
setnvkt::Reconvergence::__anon4f2394780111::Ballots314     add_ref<Ballots> setn(uint32_t bits)
315     {
316         for (uint32_t i = 0u; i < bits; ++i)
317             set(i);
318         return *this;
319     }
allvkt::Reconvergence::__anon4f2394780111::Ballots320     bool all() const
321     {
322         const uint32_t gg = subgroupCount();
323         for (uint32_t g = 0u; g < gg; ++g)
324         {
325             if (false == at(g).all())
326                 return false;
327         }
328         return (gg != 0u);
329     }
nonevkt::Reconvergence::__anon4f2394780111::Ballots330     bool none() const
331     {
332         const uint32_t gg = subgroupCount();
333         for (uint32_t g = 0u; g < gg; ++g)
334         {
335             if (false == at(g).none())
336                 return false;
337         }
338         return (gg != 0u);
339     }
anyvkt::Reconvergence::__anon4f2394780111::Ballots340     bool any() const
341     {
342         bool res          = false;
343         const uint32_t gg = subgroupCount();
344         for (uint32_t g = 0u; g < gg; ++g)
345             res |= super::at(g).any();
346         return res;
347     }
findBitvkt::Reconvergence::__anon4f2394780111::Ballots348     static uint32_t findBit(uint32_t otherFullyQualifiedInvocationID, uint32_t otherSubgroupSize)
349     {
350         return (((otherFullyQualifiedInvocationID / otherSubgroupSize) * subgroupInvocationSize) +
351                 (otherFullyQualifiedInvocationID % otherSubgroupSize));
352     }
upcastvkt::Reconvergence::__anon4f2394780111::Ballots353     inline add_cref<super> upcast(add_cref<Ballots> other) const
354     {
355         return static_cast<add_cref<super>>(other);
356     }
operator &=vkt::Reconvergence::__anon4f2394780111::Ballots357     add_ref<Ballots> operator&=(add_cref<Ballots> other)
358     {
359         DE_ASSERT(subgroupCount() == other.subgroupCount());
360         const uint32_t gg = subgroupCount();
361         for (uint32_t g = 0u; g < gg; ++g)
362             super::at(g) = super::at(g) & upcast(other).at(g);
363         return *this;
364     }
operator &vkt::Reconvergence::__anon4f2394780111::Ballots365     Ballots operator&(add_cref<Ballots> other) const
366     {
367         Ballots res(*this);
368         res &= other;
369         return res;
370     }
operator |=vkt::Reconvergence::__anon4f2394780111::Ballots371     add_ref<Ballots> operator|=(add_cref<Ballots> other)
372     {
373         DE_ASSERT(subgroupCount() == other.subgroupCount());
374         const uint32_t gg = subgroupCount();
375         for (uint32_t g = 0u; g < gg; ++g)
376             super::at(g) = super::at(g) | upcast(other).at(g);
377         return *this;
378     }
operator |vkt::Reconvergence::__anon4f2394780111::Ballots379     Ballots operator|(add_cref<Ballots> other) const
380     {
381         Ballots res(*this);
382         res |= other;
383         return res;
384     }
operator <<=vkt::Reconvergence::__anon4f2394780111::Ballots385     add_ref<Ballots> operator<<=(uint32_t bits)
386     {
387         return ((*this) = ((*this) << bits));
388     }
operator <<vkt::Reconvergence::__anon4f2394780111::Ballots389     Ballots operator<<(uint32_t bits) const
390     {
391         Ballots res(subgroupCount());
392         if (bits < size() && bits != 0u)
393         {
394             for (uint32_t b = 0; b < bits; ++b)
395                 res.set((b + bits), test(b));
396         }
397         return res;
398     }
operator ~vkt::Reconvergence::__anon4f2394780111::Ballots399     Ballots operator~() const
400     {
401         Ballots res(*this);
402         const uint32_t gg = subgroupCount();
403         for (uint32_t g = 0u; g < gg; ++g)
404             res.at(g) = super::at(g).operator~();
405         return res;
406     }
operator ==vkt::Reconvergence::__anon4f2394780111::Ballots407     bool operator==(add_cref<Ballots> other) const
408     {
409         if (super::size() == upcast(other).size())
410         {
411             const uint32_t gg = subgroupCount();
412             for (uint32_t g = 0u; g < gg; ++g)
413             {
414                 if (at(g) != other[g])
415                     return false;
416             }
417             return true;
418         }
419         return false;
420     }
operator =vkt::Reconvergence::__anon4f2394780111::Ballots421     add_ref<Ballots> operator=(add_cref<Ballots> other)
422     {
423         DE_ASSERT((subgroupCount() == other.subgroupCount()));
424         const uint32_t gg = subgroupCount();
425         for (uint32_t g = 0u; g < gg; ++g)
426             at(g) = other.at(g);
427         return *this;
428     }
operator =vkt::Reconvergence::__anon4f2394780111::Ballots429     add_ref<Ballots> operator=(add_cref<value_type> forAllGroups)
430     {
431         DE_ASSERT(super::size() >= 1u);
432         const uint32_t gg = subgroupCount();
433         for (uint32_t g = 0u; g < gg; ++g)
434             at(g) = forAllGroups;
435         return *this;
436     }
437 };
438 
// Returns a 64-bit mask with the lowest subgroupSize bits set.
// Sizes of 64 and above saturate to an all-ones mask: shifting a 64-bit value
// by 64 or more is undefined behaviour, and subgroups can be wider than 64.
uint64_t subgroupSizeToMask(uint32_t subgroupSize)
{
    if (subgroupSize >= 64u)
        return ~0ULL;
    return (1ULL << subgroupSize) - 1ULL;
}
446 
subgroupSizeToMask(uint32_t subgroupSize,uint32_t subgroupCount)447 Ballot subgroupSizeToMask(uint32_t subgroupSize, uint32_t subgroupCount)
448 {
449     DE_UNREF(subgroupCount);
450     Ballot b;
451     DE_ASSERT(subgroupSize <= b.size());
452     for (uint32_t i = 0; i < subgroupSize; ++i)
453         b.set(i);
454     return b;
455 }
456 
457 // Take a 64-bit integer, mask it to the subgroup size, and then
458 // replicate it for each subgroup
bitsetFromU64(uint64_t mask,uint32_t subgroupSize)459 bitset_inv_t bitsetFromU64(uint64_t mask, uint32_t subgroupSize)
460 {
461     mask &= subgroupSizeToMask(subgroupSize);
462     bitset_inv_t result(mask);
463     for (uint32_t i = 0; i < result.size() / subgroupSize - 1; ++i)
464     {
465         result = (result << subgroupSize) | bitset_inv_t(mask);
466     }
467     return result;
468 }
469 
ballotsFromU64(uint64_t maskValue,uint32_t subgroupSize,uint32_t subgroupCount)470 Ballots ballotsFromU64(uint64_t maskValue, uint32_t subgroupSize, uint32_t subgroupCount)
471 {
472     Ballot b(maskValue);
473     b &= subgroupSizeToMask(subgroupSize, subgroupCount);
474     Ballots result(subgroupCount);
475     for (uint32_t g = 0; g < subgroupCount; ++g)
476         result.at(g) = b;
477     return result;
478 }
479 
ballotsFromBallot(Ballot b,uint32_t subgroupSize,uint32_t subgroupCount)480 Ballots ballotsFromBallot(Ballot b, uint32_t subgroupSize, uint32_t subgroupCount)
481 {
482     b &= subgroupSizeToMask(subgroupSize, subgroupCount);
483     Ballots result(subgroupCount);
484     for (uint32_t g = 0; g < subgroupCount; ++g)
485         result.at(g) = b;
486     return result;
487 }
488 
489 // Pick out the mask for the subgroup that invocationID is a member of
bitsetToU64(const bitset_inv_t & bitset,uint32_t subgroupSize,uint32_t invocationID)490 uint64_t bitsetToU64(const bitset_inv_t &bitset, uint32_t subgroupSize, uint32_t invocationID)
491 {
492     bitset_inv_t copy(bitset);
493     copy >>= (invocationID / subgroupSize) * subgroupSize;
494     copy &= bitset_inv_t(subgroupSizeToMask(subgroupSize));
495     uint64_t mask = copy.to_ullong();
496     mask &= subgroupSizeToMask(subgroupSize);
497     return mask;
498 }
499 
500 // Pick out the mask for the subgroup that invocationID is a member of
bitsetToBallot(const Ballots & bitset,uint32_t subgroupSize,uint32_t invocationID)501 Ballot bitsetToBallot(const Ballots &bitset, uint32_t subgroupSize, uint32_t invocationID)
502 {
503     return bitset.at(invocationID / subgroupSize) & subgroupSizeToMask(subgroupSize, bitset.subgroupCount());
504 }
505 
506 // Pick out the mask for the subgroup that invocationID is a member of
bitsetToBallot(add_cref<Ballots> bitset,add_cref<Ballot> subgroupSizeMask,uint32_t subgroupSize,uint32_t invocationID)507 Ballot bitsetToBallot(add_cref<Ballots> bitset, add_cref<Ballot> subgroupSizeMask, uint32_t subgroupSize,
508                       uint32_t invocationID)
509 {
510     return bitset.at(invocationID / subgroupSize) & subgroupSizeMask;
511 }
512 
bitsetToBallot(uint64_t value,uint32_t subgroupCount,uint32_t subgroupSize,uint32_t invocationID)513 Ballot bitsetToBallot(uint64_t value, uint32_t subgroupCount, uint32_t subgroupSize, uint32_t invocationID)
514 {
515     Ballots bs = ballotsFromU64(value, subgroupSize, subgroupCount);
516     return bitsetToBallot(bs, subgroupSize, invocationID);
517 }
518 
// Returns the index of the lowest set bit of value, or -1 when value is zero.
static int findLSB(uint64_t value)
{
    if (value == 0u)
        return -1;

    int index = 0;
    while ((value & 1ULL) == 0u)
    {
        value >>= 1;
        ++index;
    }
    return index;
}
528 
529 template <uint32_t N>
findLSB(add_cref<std::bitset<N>> value)530 static uint32_t findLSB(add_cref<std::bitset<N>> value)
531 {
532     for (uint32_t i = 0u; i < N; ++i)
533     {
534         if (value.test(i))
535             return i;
536     }
537     return std::numeric_limits<uint32_t>::max();
538 }
539 
540 // For each subgroup, pick out the elected invocationID, and accumulate
541 // a bitset of all of them
bitsetElect(const bitset_inv_t & value,int32_t subgroupSize)542 static bitset_inv_t bitsetElect(const bitset_inv_t &value, int32_t subgroupSize)
543 {
544     bitset_inv_t ret; // zero initialized
545 
546     for (int32_t i = 0; i < (int32_t)value.size(); i += subgroupSize)
547     {
548         uint64_t mask = bitsetToU64(value, subgroupSize, i);
549         int lsb       = findLSB(mask);
550         ret |= bitset_inv_t(lsb == -1 ? 0 : (1ULL << lsb)) << i;
551     }
552     return ret;
553 }
554 
bitsetElect(add_cref<Ballots> value)555 static Ballots bitsetElect(add_cref<Ballots> value)
556 {
557     Ballots ret(value.subgroupCount());
558     for (uint32_t g = 0u; g < value.subgroupCount(); ++g)
559     {
560         const uint32_t lsb = findLSB<Ballots::subgroupInvocationSize>(value.at(g));
561         if (lsb != std::numeric_limits<uint32_t>::max())
562         {
563             ret.at(g).set(lsb);
564         }
565     }
566     return ret;
567 }
568 
// Plain data block of six 32-bit values.
// NOTE(review): presumably uploaded verbatim as the shaders' push constants,
// in which case the field order and sizes must not change - confirm against
// the pipeline layout and the shader sources.
struct PushConstant
{
    int32_t invocationStride;
    uint32_t width;
    uint32_t height;
    uint32_t primitiveStride;
    uint32_t subgroupStride;
    uint32_t enableInvocationIndex;
};
578 
struct Vertex
{
    // Traditional POD structure that mimics a vertex.
    // Be careful before making any changes to this structure:
    // it is strictly mapped to VK_FORMAT_R32G32B32A32_SFLOAT
    // when the graphics pipeline is constructed.
    float x;
    float y;
    float z;
    float w;
};

// Three consecutive vertices forming one triangle.
using Triangle = Vertex[3];
589 
590 class RandomProgram;
591 class ComputeRandomProgram;
592 
getSubgroupProperties(vkt::Context & context)593 std::pair<vk::VkPhysicalDeviceSubgroupProperties, vk::VkPhysicalDeviceProperties2> getSubgroupProperties(
594     vkt::Context &context)
595 {
596     vk::VkPhysicalDeviceSubgroupProperties subgroupProperties;
597     deMemset(&subgroupProperties, 0, sizeof(subgroupProperties));
598     subgroupProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
599 
600     vk::VkPhysicalDeviceProperties2 properties2;
601     deMemset(&properties2, 0, sizeof(properties2));
602     properties2.sType = vk::VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
603     properties2.pNext = &subgroupProperties;
604 
605     context.getInstanceInterface().getPhysicalDeviceProperties2(context.getPhysicalDevice(), &properties2);
606 
607     return {subgroupProperties, properties2};
608 }
609 
610 class ReconvergenceTestInstance : public TestInstance
611 {
612 public:
613     // { vert, frag, tesc, tese, geom }; if any
614     using Shaders = std::vector<Move<VkShaderModule>>;
615 
ReconvergenceTestInstance(Context & context,const CaseDef & data)616     ReconvergenceTestInstance(Context &context, const CaseDef &data)
617         : TestInstance(context)
618         , m_data(data)
619         , m_subgroupSize(getSubgroupProperties(context).first.subgroupSize)
620     {
621     }
622     ~ReconvergenceTestInstance(void) = default;
623 
624     Move<VkPipeline> createComputePipeline(const VkPipelineLayout pipelineLayout, const VkShaderModule computeShader);
625     Move<VkPipeline> createGraphicsPipeline(const VkPipelineLayout pipelineLayout, const VkRenderPass renderPass,
626                                             const uint32_t width, const uint32_t height, const Shaders &shaders,
627                                             const VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
628                                             const uint32_t patchControlPoints  = 0u);
629 
630 protected:
631     const CaseDef m_data;
632     const uint32_t m_subgroupSize;
633 };
634 
635 class ReconvergenceTestComputeInstance : public ReconvergenceTestInstance
636 {
637 public:
ReconvergenceTestComputeInstance(Context & context,const CaseDef & data,std::shared_ptr<RandomProgram> program,std::map<uint32_t,uint32_t> && subgroupSizeToMaxLoc)638     ReconvergenceTestComputeInstance(Context &context, const CaseDef &data, std::shared_ptr<RandomProgram> program,
639                                      std::map<uint32_t, uint32_t> &&subgroupSizeToMaxLoc)
640         : ReconvergenceTestInstance(context, data)
641         , m_program(std::static_pointer_cast<ComputeRandomProgram>(program))
642         , m_subgroupSizeToMaxLoc(std::move(subgroupSizeToMaxLoc))
643     {
644     }
645     ~ReconvergenceTestComputeInstance(void) = default;
646 
647     virtual tcu::TestStatus iterate(void) override;
648     qpTestResult_e calculateAndLogResult(const tcu::UVec4 *result, const std::vector<tcu::UVec4> &ref,
649                                          uint32_t invocationStride, uint32_t subgroupSize, uint32_t shaderMaxLoc);
650 
651 private:
652     std::shared_ptr<ComputeRandomProgram> m_program;
653     std::map<uint32_t, uint32_t> m_subgroupSizeToMaxLoc;
654 };
655 
656 class ReconvergenceTestGraphicsInstance : public ReconvergenceTestInstance
657 {
658 public:
ReconvergenceTestGraphicsInstance(Context & context,const CaseDef & data)659     ReconvergenceTestGraphicsInstance(Context &context, const CaseDef &data) : ReconvergenceTestInstance(context, data)
660     {
661     }
662     ~ReconvergenceTestGraphicsInstance(void) = default;
663 
664     auto makeRenderPassBeginInfo(const VkRenderPass renderPass, const VkFramebuffer framebuffer)
665         -> VkRenderPassBeginInfo;
666     virtual auto recordDrawingAndSubmit(const VkCommandBuffer cmdBuffer, const VkPipelineLayout pipelineLayout,
667                                         const VkPipeline pipeline, const VkDescriptorSet descriptorSet,
668                                         const PushConstant &pushConstant, const VkRenderPassBeginInfo &renderPassInfo,
669                                         const VkBuffer vertexBuffer, const uint32_t vertexCount, const VkImage image)
670         -> void;
671     virtual auto generateVertices(const uint32_t primitiveCount, const VkPrimitiveTopology topology,
672                                   const uint32_t patchSize = 1) -> std::vector<tcu::Vec4>;
673     virtual auto createVertexBufferAndFlush(const std::vector<tcu::Vec4> &vertices) -> de::MovePtr<BufferWithMemory>;
674     virtual auto createVertexBufferAndFlush(uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
675         -> de::MovePtr<BufferWithMemory>;
676     virtual auto createShaders(void) -> Shaders = 0;
677 
678     enum PrintMode
679     {
680         None,
681         ThreadsInColumns,
682         OutLocsInColumns,
683         IntuitiveThreadsOutlocs,
684         Console
685     };
686 
687     virtual auto calculateAndLogResult(const uint64_t *result, const std::vector<uint64_t> &ref,
688                                        uint32_t invocationStride, uint32_t subgroupSize, uint32_t shaderMaxLocs,
689                                        uint32_t primitiveCount, PrintMode printMode) -> qpTestResult_e;
690 };
691 
692 class ReconvergenceTestFragmentInstance : public ReconvergenceTestGraphicsInstance
693 {
694     struct Arrangement
695     {
696     };
697     friend class FragmentRandomProgram;
698 
699 public:
ReconvergenceTestFragmentInstance(Context & context,const CaseDef & data)700     ReconvergenceTestFragmentInstance(Context &context, const CaseDef &data)
701         : ReconvergenceTestGraphicsInstance(context, data)
702     {
703     }
704     ~ReconvergenceTestFragmentInstance(void) = default;
705     virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
706     auto callAuxiliaryShader(tcu::TestStatus &status, uint32_t triangleCount) -> std::vector<uint32_t>;
707     auto makeImageCreateInfo(VkFormat format) const -> VkImageCreateInfo;
708     virtual auto createVertexBufferAndFlush(uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
709         -> de::MovePtr<BufferWithMemory> override;
710     virtual auto iterate(void) -> tcu::TestStatus override;
711     auto calculateAndLogResultEx(tcu::TestLog &log, const tcu::UVec4 *result, const std::vector<tcu::UVec4> &ref,
712                                  const uint32_t maxLoc, const Arrangement &a, const PrintMode printMode)
713         -> qpTestResult_e;
714 };
715 
716 class ReconvergenceTestVertexInstance : public ReconvergenceTestGraphicsInstance
717 {
718 public:
ReconvergenceTestVertexInstance(Context & context,const CaseDef & data)719     ReconvergenceTestVertexInstance(Context &context, const CaseDef &data)
720         : ReconvergenceTestGraphicsInstance(context, data)
721     {
722     }
723     ~ReconvergenceTestVertexInstance(void) = default;
724     virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
725     virtual auto createVertexBufferAndFlush(uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
726         -> de::MovePtr<BufferWithMemory> override;
727 
728     virtual auto iterate(void) -> tcu::TestStatus override;
729     auto calculateAndLogResultEx(add_ref<tcu::TestLog> log, const tcu::UVec4 *result,
730                                  const std::vector<tcu::UVec4> &ref, const uint32_t maxLoc, const PrintMode printMode)
731         -> qpTestResult_e;
732 };
733 
734 class ReconvergenceTestTessCtrlInstance : public ReconvergenceTestGraphicsInstance
735 {
736 public:
ReconvergenceTestTessCtrlInstance(Context & context,const CaseDef & data)737     ReconvergenceTestTessCtrlInstance(Context &context, const CaseDef &data)
738         : ReconvergenceTestGraphicsInstance(context, data)
739     {
740     }
741     ~ReconvergenceTestTessCtrlInstance(void) = default;
742     virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
743     virtual auto iterate(void) -> tcu::TestStatus override;
744 };
745 
746 class ReconvergenceTestTessEvalInstance : public ReconvergenceTestGraphicsInstance
747 {
748 public:
ReconvergenceTestTessEvalInstance(Context & context,add_cref<CaseDef> data)749     ReconvergenceTestTessEvalInstance(Context &context, add_cref<CaseDef> data)
750         : ReconvergenceTestGraphicsInstance(context, data)
751     {
752     }
753     ~ReconvergenceTestTessEvalInstance(void) = default;
754     virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
755     virtual auto iterate(void) -> tcu::TestStatus override;
756 };
757 
758 class ReconvergenceTestGeometryInstance : public ReconvergenceTestGraphicsInstance
759 {
760 public:
ReconvergenceTestGeometryInstance(Context & context,add_cref<CaseDef> data)761     ReconvergenceTestGeometryInstance(Context &context, add_cref<CaseDef> data)
762         : ReconvergenceTestGraphicsInstance(context, data)
763     {
764     }
765     ~ReconvergenceTestGeometryInstance(void) = default;
766     virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
767     virtual auto createVertexBufferAndFlush(uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
768         -> de::MovePtr<BufferWithMemory> override;
769 
770     virtual auto iterate(void) -> tcu::TestStatus override;
771     auto calculateAndLogResultEx(add_ref<tcu::TestLog> log, const tcu::UVec4 *result,
772                                  const std::vector<tcu::UVec4> &ref, const uint32_t maxLoc, const PrintMode printMode)
773         -> qpTestResult_e;
774 };
775 
createGraphicsPipeline(const VkPipelineLayout pipelineLayout,const VkRenderPass renderPass,const uint32_t width,const uint32_t height,const Shaders & shaders,const VkPrimitiveTopology topology,const uint32_t patchControlPoints)776 Move<VkPipeline> ReconvergenceTestInstance::createGraphicsPipeline(const VkPipelineLayout pipelineLayout,
777                                                                    const VkRenderPass renderPass, const uint32_t width,
778                                                                    const uint32_t height, const Shaders &shaders,
779                                                                    const VkPrimitiveTopology topology,
780                                                                    const uint32_t patchControlPoints)
781 {
782     const DeviceInterface &vkd = m_context.getDeviceInterface();
783     const VkDevice device      = m_context.getDevice();
784     const uint32_t subpass     = 0;
785 
786     const std::vector<VkViewport> viewports{makeViewport(width, height)};
787     const std::vector<VkRect2D> scissors{makeRect2D(width, height)};
788 
789     enum ShaderIndex
790     {
791         IVERT = 0,
792         IFRAG,
793         ITESC,
794         ITESE,
795         IGEOM
796     };
797     VkShaderModule handles[5] = {DE_NULL}; // { vert, frag, tesc, tese, geom }
798 
799     for (uint32_t i = 0; i < (uint32_t)ARRAYSIZE(handles); ++i)
800     {
801         handles[i] = (i < (uint32_t)shaders.size()) ? *shaders[i] : DE_NULL;
802     }
803 
804     return makeGraphicsPipeline(vkd, device, pipelineLayout, handles[IVERT], handles[ITESC], handles[ITESE],
805                                 handles[IGEOM], handles[IFRAG], renderPass, viewports, scissors, topology, subpass,
806                                 patchControlPoints);
807 }
808 
createComputePipeline(const VkPipelineLayout pipelineLayout,const VkShaderModule computeShader)809 Move<VkPipeline> ReconvergenceTestInstance::createComputePipeline(const VkPipelineLayout pipelineLayout,
810                                                                   const VkShaderModule computeShader)
811 {
812     const DeviceInterface &vk = m_context.getDeviceInterface();
813     const VkDevice device     = m_context.getDevice();
814 
815     const uint32_t specData[2]                                               = {m_data.sizeX, m_data.sizeY};
816     const vk::VkSpecializationMapEntry entries[DE_LENGTH_OF_ARRAY(specData)] = {
817         {0, (uint32_t)(sizeof(uint32_t) * 0), sizeof(uint32_t)},
818         {1, (uint32_t)(sizeof(uint32_t) * 1), sizeof(uint32_t)},
819     };
820     const vk::VkSpecializationInfo specInfo = {
821         DE_LENGTH_OF_ARRAY(entries), // mapEntryCount
822         entries,                     // pMapEntries
823         sizeof(specData),            // dataSize
824         specData                     // pData
825     };
826 
827     const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroupSizeCreateInfo = {
828         VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, // VkStructureType sType;
829         DE_NULL,                                                                        // void* pNext;
830         m_subgroupSize // uint32_t requiredSubgroupSize;
831     };
832 
833     const VkBool32 computeFullSubgroups =
834         m_subgroupSize <= 64 && m_context.getSubgroupSizeControlFeatures().computeFullSubgroups;
835 
836     const void *shaderPNext = computeFullSubgroups ? &subgroupSizeCreateInfo : DE_NULL;
837     VkPipelineShaderStageCreateFlags pipelineShaderStageCreateFlags =
838         (VkPipelineShaderStageCreateFlags)(computeFullSubgroups ?
839                                                VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT :
840                                                0);
841 
842     const VkPipelineShaderStageCreateInfo shaderCreateInfo = {
843         VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
844         shaderPNext,
845         pipelineShaderStageCreateFlags,
846         VK_SHADER_STAGE_COMPUTE_BIT, // stage
847         computeShader,               // shader
848         "main",
849         &specInfo, // pSpecializationInfo
850     };
851 
852     const VkComputePipelineCreateInfo pipelineCreateInfo = {
853         VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
854         DE_NULL,
855         0u,                // flags
856         shaderCreateInfo,  // cs
857         pipelineLayout,    // layout
858         (vk::VkPipeline)0, // basePipelineHandle
859         0u,                // basePipelineIndex
860     };
861 
862     return vk::createComputePipeline(vk, device, DE_NULL, &pipelineCreateInfo, NULL);
863 }
864 
// Kinds of instruction a generated random program can contain.
enum OPType
{
    // Store subgroupBallot().
    // OP::caseValue starts at zero for an OP_BALLOT and is set to 1 by the
    // simulation when the ballot is not workgroup- (or subgroup-) uniform.
    // In WUCF modes only workgroup-uniform ballots are validated for correctness.
    OP_BALLOT,

    // Store a literal constant.
    OP_STORE,

    // if ((1ULL << gl_SubgroupInvocationID) & mask).
    // A mask of ~0ULL is a special case, emitted as "if (inputA.a[idx] == idx)".
    OP_IF_MASK,
    OP_ELSE_MASK,
    OP_ENDIF,

    // if (gl_SubgroupInvocationID == loopIdxN), N being the most nested loop counter.
    OP_IF_LOOPCOUNT,
    OP_ELSE_LOOPCOUNT,

    // if (gl_LocalInvocationIndex >= inputA.a[N]), N being the most nested loop counter.
    OP_IF_LOCAL_INVOCATION_INDEX,
    OP_ELSE_LOCAL_INVOCATION_INDEX,

    // break / continue.
    OP_BREAK,
    OP_CONTINUE,

    // if (subgroupElect()).
    OP_ELECT,

    // Loop with a uniform number of iterations (read from a buffer).
    OP_BEGIN_FOR_UNIF,
    OP_END_FOR_UNIF,

    // for (int loopIdxN = 0; loopIdxN < gl_SubgroupInvocationID + 1; ++loopIdxN)
    OP_BEGIN_FOR_VAR,
    OP_END_FOR_VAR,

    // for (int loopIdxN = 0;; ++loopIdxN, OP_BALLOT)
    // Always contains an "if (subgroupElect()) break;".
    // The continue construct does the equivalent of an OP_BALLOT.
    OP_BEGIN_FOR_INF,
    OP_END_FOR_INF,

    // do { loopIdxN++; ... } while (loopIdxN < uniformValue);
    OP_BEGIN_DO_WHILE_UNIF,
    OP_END_DO_WHILE_UNIF,

    // do { ... } while (true);
    // Always contains an "if (subgroupElect()) break;".
    OP_BEGIN_DO_WHILE_INF,
    OP_END_DO_WHILE_INF,

    // return;
    OP_RETURN,

    // Function call: the code bracketed by these two is extracted into a separate function.
    OP_CALL_BEGIN,
    OP_CALL_END,

    // switch on a uniform value.
    OP_SWITCH_UNIF_BEGIN,
    // switch on (gl_SubgroupInvocationID & 3).
    OP_SWITCH_VAR_BEGIN,
    // switch on a loopIdx value.
    OP_SWITCH_LOOP_COUNT_BEGIN,

    // case carrying an (invocation mask, case mask) pair.
    OP_CASE_MASK_BEGIN,
    // case used by loop-counter switches, carrying a value and a mask of loop iterations.
    OP_CASE_LOOP_COUNT_BEGIN,

    // End of a switch / of a case.
    OP_SWITCH_END,
    OP_CASE_END,

    // Extra code with no functional effect. Currently includes:
    // - value 0: while (!subgroupElect()) {}
    // - value 1: if (condition_that_is_false) { infinite loop }
    OP_NOISE,

    // Does nothing; markup only.
    OP_NOP
};
952 
OPtypeToStr(const OPType op)953 const char *OPtypeToStr(const OPType op)
954 {
955 #define MAKETEXT(s__) #s__
956 #define CASETEXT(e__) \
957     case e__:         \
958         return MAKETEXT(e__)
959     switch (op)
960     {
961         CASETEXT(OP_BALLOT);
962         CASETEXT(OP_STORE);
963         CASETEXT(OP_IF_MASK);
964         CASETEXT(OP_ELSE_MASK);
965         CASETEXT(OP_ENDIF);
966         CASETEXT(OP_IF_LOOPCOUNT);
967         CASETEXT(OP_ELSE_LOOPCOUNT);
968         CASETEXT(OP_IF_LOCAL_INVOCATION_INDEX);
969         CASETEXT(OP_ELSE_LOCAL_INVOCATION_INDEX);
970         CASETEXT(OP_BREAK);
971         CASETEXT(OP_CONTINUE);
972         CASETEXT(OP_ELECT);
973         CASETEXT(OP_BEGIN_FOR_UNIF);
974         CASETEXT(OP_END_FOR_UNIF);
975         CASETEXT(OP_BEGIN_FOR_VAR);
976         CASETEXT(OP_END_FOR_VAR);
977         CASETEXT(OP_BEGIN_FOR_INF);
978         CASETEXT(OP_END_FOR_INF);
979         CASETEXT(OP_BEGIN_DO_WHILE_UNIF);
980         CASETEXT(OP_END_DO_WHILE_UNIF);
981         CASETEXT(OP_BEGIN_DO_WHILE_INF);
982         CASETEXT(OP_END_DO_WHILE_INF);
983         CASETEXT(OP_RETURN);
984         CASETEXT(OP_CALL_BEGIN);
985         CASETEXT(OP_CALL_END);
986         CASETEXT(OP_SWITCH_UNIF_BEGIN);
987         CASETEXT(OP_SWITCH_VAR_BEGIN);
988         CASETEXT(OP_SWITCH_LOOP_COUNT_BEGIN);
989         CASETEXT(OP_CASE_MASK_BEGIN);
990         CASETEXT(OP_CASE_LOOP_COUNT_BEGIN);
991         CASETEXT(OP_SWITCH_END);
992         CASETEXT(OP_CASE_END);
993         CASETEXT(OP_NOISE);
994         CASETEXT(OP_NOP);
995     }
996     return "<Unknown>";
997 }
998 
// The different conditions an "if" in a generated program can test.
enum IFType
{
    IF_MASK,                   // test against one of the random invocation masks
    IF_UNIFORM,                // same as IF_MASK, but with an all-ones mask
    IF_LOOPCOUNT,              // test involving the loop counter
    IF_LOCAL_INVOCATION_INDEX, // test involving the local invocation index
};
1007 
1008 class OP
1009 {
1010 public:
OP(OPType _type,uint64_t _value,uint32_t _caseValue=0)1011     OP(OPType _type, uint64_t _value, uint32_t _caseValue = 0)
1012         : type(_type)
1013         , value(_value)
1014         // by default, initialized only lower part with a repetition of _value
1015         , bvalue(tcu::UVec4(uint32_t(_value), uint32_t(_value >> 32), uint32_t(_value), uint32_t(_value >> 32)))
1016         , caseValue(_caseValue)
1017     {
1018     }
1019 
1020     // The type of operation and an optional value.
1021     // The value could be a mask for an if test, the index of the loop
1022     // header for an end of loop, or the constant value for a store instruction
1023     OPType type;
1024     uint64_t value;
1025     Ballot bvalue;
1026     uint32_t caseValue;
1027 };
1028 
1029 class RandomProgram
1030 {
1031 
1032 public:
RandomProgram(const CaseDef & c,uint32_t invocationCount=0u)1033     RandomProgram(const CaseDef &c, uint32_t invocationCount = 0u)
1034         : caseDef(c)
1035         , invocationStride(invocationCount ? invocationCount : (c.sizeX * c.sizeY))
1036         , rnd()
1037         , ops()
1038         , masks()
1039         , ballotMasks()
1040         , numMasks(5)
1041         , nesting(0)
1042         , maxNesting(c.maxNesting)
1043         , loopNesting(0)
1044         , loopNestingThisFunction(0)
1045         , callNesting(0)
1046         , minCount(30)
1047         , indent(0)
1048         , isLoopInf(100, false)
1049         , doneInfLoopBreak(100, false)
1050         , storeBase(0x10000)
1051     {
1052         deRandom_init(&rnd, caseDef.seed);
1053         for (int i = 0; i < numMasks; ++i)
1054         {
1055             const uint64_t lo = deRandom_getUint64(&rnd);
1056             const uint64_t hi = deRandom_getUint64(&rnd);
1057             const tcu::UVec4 v4(uint32_t(lo), uint32_t(lo >> 32), uint32_t(hi), uint32_t(hi >> 32));
1058             ballotMasks.emplace_back(v4);
1059             masks.push_back(lo);
1060         }
1061     }
1062     virtual ~RandomProgram() = default;
1063 
1064     const CaseDef caseDef;
1065     const uint32_t invocationStride;
1066     deRandom rnd;
1067     vector<OP> ops;
1068     vector<uint64_t> masks;
1069     vector<Ballot> ballotMasks;
1070     int32_t numMasks;
1071     int32_t nesting;
1072     int32_t maxNesting;
1073     int32_t loopNesting;
1074     int32_t loopNestingThisFunction;
1075     int32_t callNesting;
1076     int32_t minCount;
1077     int32_t indent;
1078     vector<bool> isLoopInf;
1079     vector<bool> doneInfLoopBreak;
1080     // Offset the value we use for OP_STORE, to avoid colliding with fully converged
1081     // active masks with small subgroup sizes (e.g. with subgroupSize == 4, the SUCF
1082     // tests need to know that 0xF is really an active mask).
1083     int32_t storeBase;
1084 
genIf(IFType ifType,uint32_t maxLocalIndexCmp=0u)1085     virtual void genIf(IFType ifType, uint32_t maxLocalIndexCmp = 0u)
1086     {
1087         uint32_t maskIdx = deRandom_getUint32(&rnd) % numMasks;
1088         uint64_t mask    = masks[maskIdx];
1089         Ballot bmask     = ballotMasks[maskIdx];
1090         if (ifType == IF_UNIFORM)
1091         {
1092             mask = ~0ULL;
1093             bmask.set();
1094         }
1095 
1096         uint32_t localIndexCmp = deRandom_getUint32(&rnd) % (maxLocalIndexCmp ? maxLocalIndexCmp : invocationStride);
1097         if (ifType == IF_LOCAL_INVOCATION_INDEX)
1098             ops.push_back({OP_IF_LOCAL_INVOCATION_INDEX, localIndexCmp});
1099         else if (ifType == IF_LOOPCOUNT)
1100             ops.push_back({OP_IF_LOOPCOUNT, 0});
1101         else
1102         {
1103             ops.push_back({OP_IF_MASK, mask});
1104             ops.back().bvalue = bmask;
1105         }
1106 
1107         nesting++;
1108 
1109         size_t thenBegin = ops.size();
1110         pickOP(2);
1111         size_t thenEnd = ops.size();
1112 
1113         uint32_t randElse = (deRandom_getUint32(&rnd) % 100);
1114         if (randElse < 50)
1115         {
1116             if (ifType == IF_LOCAL_INVOCATION_INDEX)
1117                 ops.push_back({OP_ELSE_LOCAL_INVOCATION_INDEX, localIndexCmp});
1118             else if (ifType == IF_LOOPCOUNT)
1119                 ops.push_back({OP_ELSE_LOOPCOUNT, 0});
1120             else
1121                 ops.push_back({OP_ELSE_MASK, 0});
1122 
1123             if (randElse < 10)
1124             {
1125                 // Sometimes make the else block identical to the then block
1126                 for (size_t i = thenBegin; i < thenEnd; ++i)
1127                     ops.push_back(ops[i]);
1128             }
1129             else
1130                 pickOP(2);
1131         }
1132         ops.push_back({OP_ENDIF, 0});
1133         nesting--;
1134     }
1135 
genForUnif()1136     void genForUnif()
1137     {
1138         uint32_t iterCount = (deRandom_getUint32(&rnd) % 5) + 1;
1139         ops.push_back({OP_BEGIN_FOR_UNIF, iterCount});
1140         uint32_t loopheader = (uint32_t)ops.size() - 1;
1141         nesting++;
1142         loopNesting++;
1143         loopNestingThisFunction++;
1144         pickOP(2);
1145         ops.push_back({OP_END_FOR_UNIF, loopheader});
1146         loopNestingThisFunction--;
1147         loopNesting--;
1148         nesting--;
1149     }
1150 
genDoWhileUnif()1151     void genDoWhileUnif()
1152     {
1153         uint32_t iterCount = (deRandom_getUint32(&rnd) % 5) + 1;
1154         ops.push_back({OP_BEGIN_DO_WHILE_UNIF, iterCount});
1155         uint32_t loopheader = (uint32_t)ops.size() - 1;
1156         nesting++;
1157         loopNesting++;
1158         loopNestingThisFunction++;
1159         pickOP(2);
1160         ops.push_back({OP_END_DO_WHILE_UNIF, loopheader});
1161         loopNestingThisFunction--;
1162         loopNesting--;
1163         nesting--;
1164     }
1165 
genForVar()1166     void genForVar()
1167     {
1168         ops.push_back({OP_BEGIN_FOR_VAR, 0});
1169         uint32_t loopheader = (uint32_t)ops.size() - 1;
1170         nesting++;
1171         loopNesting++;
1172         loopNestingThisFunction++;
1173         pickOP(2);
1174         ops.push_back({OP_END_FOR_VAR, loopheader});
1175         loopNestingThisFunction--;
1176         loopNesting--;
1177         nesting--;
1178     }
1179 
genForInf()1180     void genForInf()
1181     {
1182         ops.push_back({OP_BEGIN_FOR_INF, 0});
1183         uint32_t loopheader = (uint32_t)ops.size() - 1;
1184 
1185         nesting++;
1186         loopNesting++;
1187         loopNestingThisFunction++;
1188         isLoopInf[loopNesting]        = true;
1189         doneInfLoopBreak[loopNesting] = false;
1190 
1191         pickOP(2);
1192 
1193         genElect(true);
1194         doneInfLoopBreak[loopNesting] = true;
1195 
1196         pickOP(2);
1197 
1198         ops.push_back({OP_END_FOR_INF, loopheader});
1199 
1200         isLoopInf[loopNesting]        = false;
1201         doneInfLoopBreak[loopNesting] = false;
1202         loopNestingThisFunction--;
1203         loopNesting--;
1204         nesting--;
1205     }
1206 
genDoWhileInf()1207     void genDoWhileInf()
1208     {
1209         ops.push_back({OP_BEGIN_DO_WHILE_INF, 0});
1210         uint32_t loopheader = (uint32_t)ops.size() - 1;
1211 
1212         nesting++;
1213         loopNesting++;
1214         loopNestingThisFunction++;
1215         isLoopInf[loopNesting]        = true;
1216         doneInfLoopBreak[loopNesting] = false;
1217 
1218         pickOP(2);
1219 
1220         genElect(true);
1221         doneInfLoopBreak[loopNesting] = true;
1222 
1223         pickOP(2);
1224 
1225         ops.push_back({OP_END_DO_WHILE_INF, loopheader});
1226 
1227         isLoopInf[loopNesting]        = false;
1228         doneInfLoopBreak[loopNesting] = false;
1229         loopNestingThisFunction--;
1230         loopNesting--;
1231         nesting--;
1232     }
1233 
genBreak()1234     void genBreak()
1235     {
1236         if (loopNestingThisFunction > 0)
1237         {
1238             // Sometimes put the break in a divergent if
1239             if ((deRandom_getUint32(&rnd) % 100) < 10)
1240             {
1241                 ops.push_back({OP_IF_MASK, masks[0]});
1242                 ops.back().bvalue = ballotMasks[0];
1243                 ops.push_back({OP_BREAK, 0});
1244                 ops.push_back({OP_ELSE_MASK, 0});
1245                 ops.push_back({OP_BREAK, 0});
1246                 ops.push_back({OP_ENDIF, 0});
1247             }
1248             else
1249                 ops.push_back({OP_BREAK, 0});
1250         }
1251     }
1252 
genContinue()1253     void genContinue()
1254     {
1255         // continues are allowed if we're in a loop and the loop is not infinite,
1256         // or if it is infinite and we've already done a subgroupElect+break.
1257         // However, adding more continues seems to reduce the failure rate, so
1258         // disable it for now
1259         if (loopNestingThisFunction > 0 && !(isLoopInf[loopNesting] /*&& !doneInfLoopBreak[loopNesting]*/))
1260         {
1261             // Sometimes put the continue in a divergent if
1262             if ((deRandom_getUint32(&rnd) % 100) < 10)
1263             {
1264                 ops.push_back({OP_IF_MASK, masks[0]});
1265                 ops.back().bvalue = ballotMasks[0];
1266                 ops.push_back({OP_CONTINUE, 0});
1267                 ops.push_back({OP_ELSE_MASK, 0});
1268                 ops.push_back({OP_CONTINUE, 0});
1269                 ops.push_back({OP_ENDIF, 0});
1270             }
1271             else
1272                 ops.push_back({OP_CONTINUE, 0});
1273         }
1274     }
1275 
1276     // doBreak is used to generate "if (subgroupElect()) { ... break; }" inside infinite loops
genElect(bool doBreak)1277     void genElect(bool doBreak)
1278     {
1279         ops.push_back({OP_ELECT, 0});
1280         nesting++;
1281         if (doBreak)
1282         {
1283             // Put something interestign before the break
1284             genBallot();
1285             genBallot();
1286             if ((deRandom_getUint32(&rnd) % 100) < 10)
1287                 pickOP(1);
1288 
1289             // if we're in a function, sometimes  use return instead
1290             if (callNesting > 0 && (deRandom_getUint32(&rnd) % 100) < 30)
1291                 ops.push_back({OP_RETURN, 0});
1292             else
1293                 genBreak();
1294         }
1295         else
1296             pickOP(2);
1297 
1298         ops.push_back({OP_ENDIF, 0});
1299         nesting--;
1300     }
1301 
genReturn()1302     void genReturn()
1303     {
1304         uint32_t r = deRandom_getUint32(&rnd) % 100;
1305         if (nesting > 0 &&
1306             // Use return rarely in main, 20% of the time in a singly nested loop in a function
1307             // and 50% of the time in a multiply nested loop in a function
1308             (r < 5 || (callNesting > 0 && loopNestingThisFunction > 0 && r < 20) ||
1309              (callNesting > 0 && loopNestingThisFunction > 1 && r < 50)))
1310         {
1311             genBallot();
1312             if ((deRandom_getUint32(&rnd) % 100) < 10)
1313             {
1314                 ops.push_back({OP_IF_MASK, masks[0]});
1315                 ops.back().bvalue = ballotMasks[0];
1316                 ops.push_back({OP_RETURN, 0});
1317                 ops.push_back({OP_ELSE_MASK, 0});
1318                 ops.push_back({OP_RETURN, 0});
1319                 ops.push_back({OP_ENDIF, 0});
1320             }
1321             else
1322                 ops.push_back({OP_RETURN, 0});
1323         }
1324     }
1325 
1326     // Generate a function call. Save and restore some loop information, which is used to
1327     // determine when it's safe to use break/continue
genCall()1328     void genCall()
1329     {
1330         ops.push_back({OP_CALL_BEGIN, 0});
1331         callNesting++;
1332         nesting++;
1333         int32_t saveLoopNestingThisFunction = loopNestingThisFunction;
1334         loopNestingThisFunction             = 0;
1335 
1336         pickOP(2);
1337 
1338         loopNestingThisFunction = saveLoopNestingThisFunction;
1339         nesting--;
1340         callNesting--;
1341         ops.push_back({OP_CALL_END, 0});
1342     }
1343 
1344     // Generate switch on a uniform value:
1345     // switch (inputA.a[r]) {
1346     // case r+1: ... break; // should not execute
1347     // case r:   ... break; // should branch uniformly
1348     // case r+2: ... break; // should not execute
1349     // }
genSwitchUnif()1350     void genSwitchUnif()
1351     {
1352         uint32_t r = deRandom_getUint32(&rnd) % 5;
1353         ops.push_back({OP_SWITCH_UNIF_BEGIN, r});
1354         nesting++;
1355 
1356         ops.push_back({OP_CASE_MASK_BEGIN, 0, 1u << (r + 1)});
1357         pickOP(1);
1358         ops.push_back({OP_CASE_END, 0});
1359 
1360         ops.push_back({OP_CASE_MASK_BEGIN, ~0ULL, 1u << r});
1361         ops.back().bvalue.set();
1362         pickOP(2);
1363         ops.push_back({OP_CASE_END, 0});
1364 
1365         ops.push_back({OP_CASE_MASK_BEGIN, 0, 1u << (r + 2)});
1366         pickOP(1);
1367         ops.push_back({OP_CASE_END, 0});
1368 
1369         ops.push_back({OP_SWITCH_END, 0});
1370         nesting--;
1371     }
1372 
1373     // switch (gl_SubgroupInvocationID & 3) with four unique targets
genSwitchVar()1374     void genSwitchVar()
1375     {
1376         ops.push_back({OP_SWITCH_VAR_BEGIN, 0});
1377         nesting++;
1378 
1379         ops.push_back({OP_CASE_MASK_BEGIN, 0x1111111111111111ULL, 1 << 0});
1380         ops.back().bvalue = tcu::UVec4(0x11111111);
1381         pickOP(1);
1382         ops.push_back({OP_CASE_END, 0});
1383 
1384         ops.push_back({OP_CASE_MASK_BEGIN, 0x2222222222222222ULL, 1 << 1});
1385         ops.back().bvalue = tcu::UVec4(0x22222222);
1386         pickOP(1);
1387         ops.push_back({OP_CASE_END, 0});
1388 
1389         ops.push_back({OP_CASE_MASK_BEGIN, 0x4444444444444444ULL, 1 << 2});
1390         ops.back().bvalue = tcu::UVec4(0x44444444);
1391         pickOP(1);
1392         ops.push_back({OP_CASE_END, 0});
1393 
1394         ops.push_back({OP_CASE_MASK_BEGIN, 0x8888888888888888ULL, 1 << 3});
1395         ops.back().bvalue = tcu::UVec4(0x88888888);
1396         pickOP(1);
1397         ops.push_back({OP_CASE_END, 0});
1398 
1399         ops.push_back({OP_SWITCH_END, 0});
1400         nesting--;
1401     }
1402 
1403     // switch (gl_SubgroupInvocationID & 3) with two shared targets.
1404     // XXX TODO: The test considers these two targets to remain converged,
1405     // though we haven't agreed to that behavior yet.
genSwitchMulticase()1406     void genSwitchMulticase()
1407     {
1408         ops.push_back({OP_SWITCH_VAR_BEGIN, 0});
1409         nesting++;
1410 
1411         ops.push_back({OP_CASE_MASK_BEGIN, 0x3333333333333333ULL, (1 << 0) | (1 << 1)});
1412         ops.back().bvalue = tcu::UVec4(0x33333333);
1413         pickOP(2);
1414         ops.push_back({OP_CASE_END, 0});
1415 
1416         ops.push_back({OP_CASE_MASK_BEGIN, 0xCCCCCCCCCCCCCCCCULL, (1 << 2) | (1 << 3)});
1417         ops.back().bvalue = tcu::UVec4(0xCCCCCCCC);
1418         pickOP(2);
1419         ops.push_back({OP_CASE_END, 0});
1420 
1421         ops.push_back({OP_SWITCH_END, 0});
1422         nesting--;
1423     }
1424 
1425     // switch (loopIdxN) {
1426     // case 1:  ... break;
1427     // case 2:  ... break;
1428     // default: ... break;
1429     // }
genSwitchLoopCount()1430     void genSwitchLoopCount()
1431     {
1432         uint32_t r = deRandom_getUint32(&rnd) % loopNesting;
1433         ops.push_back({OP_SWITCH_LOOP_COUNT_BEGIN, r});
1434         nesting++;
1435 
1436         ops.push_back({OP_CASE_LOOP_COUNT_BEGIN, 1ULL << 1, 1});
1437         ops.back().bvalue = tcu::UVec4(1 << 1, 0, 0, 0);
1438         pickOP(1);
1439         ops.push_back({OP_CASE_END, 0});
1440 
1441         ops.push_back({OP_CASE_LOOP_COUNT_BEGIN, 1ULL << 2, 2});
1442         ops.back().bvalue = tcu::UVec4(1 << 2, 0, 0, 0);
1443         pickOP(1);
1444         ops.push_back({OP_CASE_END, 0});
1445 
1446         // default:
1447         ops.push_back({OP_CASE_LOOP_COUNT_BEGIN, ~6ULL, 0xFFFFFFFF});
1448         ops.back().bvalue = tcu::UVec4(~6u, ~0u, ~0u, ~0u);
1449         pickOP(1);
1450         ops.push_back({OP_CASE_END, 0});
1451 
1452         ops.push_back({OP_SWITCH_END, 0});
1453         nesting--;
1454     }
1455 
pickOP(uint32_t count)1456     void pickOP(uint32_t count)
1457     {
1458         // Pick "count" instructions. These can recursively insert more instructions,
1459         // so "count" is just a seed
1460         for (uint32_t i = 0; i < count; ++i)
1461         {
1462             genBallot();
1463             if (nesting < maxNesting)
1464             {
1465                 uint32_t r = deRandom_getUint32(&rnd) % 11;
1466                 switch (r)
1467                 {
1468                 default:
1469                     DE_ASSERT(0);
1470                     // fallthrough
1471                 case 2:
1472                     if (loopNesting)
1473                     {
1474                         genIf(IF_LOOPCOUNT);
1475                         break;
1476                     }
1477                     // fallthrough
1478                 case 10:
1479                     genIf(IF_LOCAL_INVOCATION_INDEX);
1480                     break;
1481                 case 0:
1482                     genIf(IF_MASK);
1483                     break;
1484                 case 1:
1485                     genIf(IF_UNIFORM);
1486                     break;
1487                 case 3:
1488                 {
1489                     // don't nest loops too deeply, to avoid extreme memory usage or timeouts
1490                     if (loopNesting <= 3)
1491                     {
1492                         uint32_t r2 = deRandom_getUint32(&rnd) % 3;
1493                         switch (r2)
1494                         {
1495                         default:
1496                             DE_ASSERT(0); // fallthrough
1497                         case 0:
1498                             genForUnif();
1499                             break;
1500                         case 1:
1501                             genForInf();
1502                             break;
1503                         case 2:
1504                             genForVar();
1505                             break;
1506                         }
1507                     }
1508                 }
1509                 break;
1510                 case 4:
1511                     genBreak();
1512                     break;
1513                 case 5:
1514                     genContinue();
1515                     break;
1516                 case 6:
1517                     genElect(false);
1518                     break;
1519                 case 7:
1520                 {
1521                     uint32_t r2 = deRandom_getUint32(&rnd) % 5;
1522                     if (r2 == 0 && callNesting == 0 && nesting < maxNesting - 2)
1523                         genCall();
1524                     else
1525                         genReturn();
1526                     break;
1527                 }
1528                 case 8:
1529                 {
1530                     // don't nest loops too deeply, to avoid extreme memory usage or timeouts
1531                     if (loopNesting <= 3)
1532                     {
1533                         uint32_t r2 = deRandom_getUint32(&rnd) % 2;
1534                         switch (r2)
1535                         {
1536                         default:
1537                             DE_ASSERT(0); // fallthrough
1538                         case 0:
1539                             genDoWhileUnif();
1540                             break;
1541                         case 1:
1542                             genDoWhileInf();
1543                             break;
1544                         }
1545                     }
1546                 }
1547                 break;
1548                 case 9:
1549                 {
1550                     uint32_t r2 = deRandom_getUint32(&rnd) % 4;
1551                     switch (r2)
1552                     {
1553                     default:
1554                         DE_ASSERT(0);
1555                         // fallthrough
1556                     case 0:
1557                         genSwitchUnif();
1558                         break;
1559                     case 1:
1560                         if (loopNesting > 0)
1561                         {
1562                             genSwitchLoopCount();
1563                             break;
1564                         }
1565                         // fallthrough
1566                     case 2:
1567                         if (caseDef.testType != TT_MAXIMAL)
1568                         {
1569                             // multicase doesn't have fully-defined behavior for MAXIMAL tests,
1570                             // but does for SUCF tests
1571                             genSwitchMulticase();
1572                             break;
1573                         }
1574                         // fallthrough
1575                     case 3:
1576                         genSwitchVar();
1577                         break;
1578                     }
1579                 }
1580                 break;
1581                 }
1582             }
1583             genBallot();
1584         }
1585     }
1586 
genBallot()1587     void genBallot()
1588     {
1589         // optionally insert ballots, stores, and noise. Ballots and stores are used to determine
1590         // correctness.
1591         if ((deRandom_getUint32(&rnd) % 100) < 20)
1592         {
1593             if (ops.size() < 2 || !(ops[ops.size() - 1].type == OP_BALLOT ||
1594                                     (ops[ops.size() - 1].type == OP_STORE && ops[ops.size() - 2].type == OP_BALLOT)))
1595             {
1596                 // do a store along with each ballot, so we can correlate where
1597                 // the ballot came from
1598                 if (caseDef.testType != TT_MAXIMAL)
1599                     ops.push_back({OP_STORE, (uint32_t)ops.size() + storeBase});
1600                 ops.push_back({OP_BALLOT, 0});
1601             }
1602         }
1603 
1604         if ((deRandom_getUint32(&rnd) % 100) < 10)
1605         {
1606             if (ops.size() < 2 || !(ops[ops.size() - 1].type == OP_STORE ||
1607                                     (ops[ops.size() - 1].type == OP_BALLOT && ops[ops.size() - 2].type == OP_STORE)))
1608             {
1609                 // SUCF does a store with every ballot. Don't bloat the code by adding more.
1610                 if (caseDef.testType == TT_MAXIMAL)
1611                     ops.push_back({OP_STORE, (uint32_t)ops.size() + storeBase});
1612             }
1613         }
1614 
1615         uint32_t r = deRandom_getUint32(&rnd) % 10000;
1616         if (r < 3)
1617             ops.push_back({OP_NOISE, 0});
1618         else if (r < 10)
1619             ops.push_back({OP_NOISE, 1});
1620     }
1621 
generateRandomProgram(qpWatchDog * watchDog,add_ref<tcu::TestLog> log)1622     std::map<uint32_t, uint32_t> generateRandomProgram(qpWatchDog *watchDog, add_ref<tcu::TestLog> log)
1623     {
1624         std::vector<tcu::UVec4> ref;
1625         std::map<uint32_t, uint32_t> subgroupSizeToMaxLoc;
1626 
1627         do
1628         {
1629             ops.clear();
1630             while ((int32_t)ops.size() < minCount)
1631                 pickOP(1);
1632 
1633             // Retry until the program has some UCF results in it
1634             if (caseDef.isUCF())
1635             {
1636                 // Simulate for all subgroup sizes, to determine whether OP_BALLOTs are nonuniform
1637                 for (int32_t subgroupSize = 4; subgroupSize <= 128; subgroupSize *= 2)
1638                 {
1639                     //simulate(true, subgroupSize, ref);
1640                     const uint32_t maxLoc = execute(watchDog, true, subgroupSize, 0u, invocationStride, ref, log);
1641                     subgroupSizeToMaxLoc[subgroupSize] = maxLoc;
1642                 }
1643             }
1644         } while (caseDef.isUCF() && !hasUCF());
1645 
1646         return subgroupSizeToMaxLoc;
1647     }
1648 
printIndent(std::stringstream & css)1649     void printIndent(std::stringstream &css)
1650     {
1651         for (int32_t i = 0; i < indent; ++i)
1652             css << " ";
1653     }
1654 
1655     struct FlowState // Snapshot of the printing position; printCode builds one for each call to a print* hook.
1656     {
1657         add_cref<vector<OP>> ops; // op list being printed (presumably a const reference via add_cref - confirm)
1658         const int32_t opsIndex; // index into ops of the op currently being emitted
1659         const int32_t loopNesting; // value of the loopNesting counter when the snapshot was taken
1660         const int funcNum; // printCode's running function counter when the snapshot was taken
1661     };
1662 
1663     // State of the subgroup at each level of nesting
         // Same fields as SubgroupState2, but the masks are bitset_inv_t instead of Ballots.
         // The struct declares no constructor or member initializers, so it does not
         // initialize its own fields.
1664     struct SubgroupState
1665     {
1666         // Currently executing
1667         bitset_inv_t activeMask;
1668         // Have executed a continue instruction in this loop
1669         bitset_inv_t continueMask;
1670         // index of the current if test or loop header
1671         uint32_t header;
1672         // number of loop iterations performed
1673         uint32_t tripCount;
1674         // is this nesting a loop?
             // NOTE(review): presumably a 0/1 flag, as execute() uses SubgroupState2::isLoop - confirm
1675         uint32_t isLoop;
1676         // is this nesting a function call?
1677         uint32_t isCall;
1678         // is this nesting a switch?
1679         uint32_t isSwitch;
1680     };
1681 
1682     struct SubgroupState2
1683     {
1684         // Currently executing
1685         Ballots activeMask;
1686         // Have executed a continue instruction in this loop
1687         Ballots continueMask;
1688         // index of the current if test or loop header
1689         uint32_t header;
1690         // number of loop iterations performed
1691         uint32_t tripCount;
1692         // is this nesting a loop?
1693         uint32_t isLoop;
1694         // is this nesting a function call?
1695         uint32_t isCall;
1696         // is this nesting a switch?
1697         uint32_t isSwitch;
1698         virtual ~SubgroupState2() = default;
SubgroupState2vkt::Reconvergence::__anon4f2394780111::RandomProgram::SubgroupState21699         SubgroupState2() : SubgroupState2(0)
1700         {
1701         }
SubgroupState2vkt::Reconvergence::__anon4f2394780111::RandomProgram::SubgroupState21702         SubgroupState2(uint32_t subgroupCount)
1703             : activeMask(subgroupCount)
1704             , continueMask(subgroupCount)
1705             , header()
1706             , tripCount()
1707             , isLoop()
1708             , isCall()
1709             , isSwitch()
1710         {
1711         }
1712     };
1713 
1714     struct Prerequisites // Empty type with no members. NOTE(review): presumably the (base) type of what makePrerequisites() returns and execute() forwards to simulateBallot()/simulateStore() - confirm.
1715     {
1716     };
1717 
getPartitionBallotText()1718     virtual std::string getPartitionBallotText()
1719     {
1720         return "subgroupBallot(true)";
1721     }
1722 
printIfLocalInvocationIndex(std::stringstream & css,add_cref<FlowState> flow)1723     virtual void printIfLocalInvocationIndex(std::stringstream &css, add_cref<FlowState> flow)
1724     {
1725         printIndent(css);
1726         css << "if (gl_LocalInvocationIndex >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
1727     }
1728 
printStore(std::stringstream & css,add_cref<FlowState> flow)1729     virtual void printStore(std::stringstream &css, add_cref<FlowState> flow)
1730     {
1731         printIndent(css);
1732         css << "outputC.loc[gl_LocalInvocationIndex]++;\n";
1733         printIndent(css);
1734         css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex].x = 0x" << std::hex
1735             << flow.ops[flow.opsIndex].value << ";\n";
1736     }
1737 
printBallot(std::stringstream & css,add_cref<FlowState>,bool endWithSemicolon=false)1738     virtual void printBallot(std::stringstream &css, add_cref<FlowState>, bool endWithSemicolon = false)
1739     {
1740         printIndent(css);
1741 
1742         css << "outputC.loc[gl_LocalInvocationIndex]++,";
1743         // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
1744         // a ballot, to make sure the ballot is "diverged enough". Don't do this for
1745         // subgroup_uniform_control_flow, since we only validate results that must be fully
1746         // reconverged.
1747         if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
1748         {
1749             css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = " << getPartitionBallotText()
1750                 << ".xy";
1751         }
1752         else if (caseDef.isElect())
1753         {
1754             css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex].x = elect()";
1755         }
1756         else
1757         {
1758             css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = subgroupBallot(true).xy";
1759         }
1760         if (endWithSemicolon)
1761         {
1762             css << ";\n";
1763         }
1764     }
1765 
printCode(std::stringstream & functions,std::stringstream & main)1766     void printCode(std::stringstream &functions, std::stringstream &main)
1767     {
1768         std::stringstream *css = &main;
1769         indent                 = 4;
1770         loopNesting            = 0;
1771         int funcNum            = 0;
1772         int32_t i              = 0;
1773 
1774         auto makeFlowState = [&]() -> FlowState { return FlowState{ops, i, loopNesting, funcNum}; };
1775 
1776         for (; i < (int32_t)ops.size(); ++i)
1777         {
1778             switch (ops[i].type)
1779             {
1780             case OP_IF_MASK:
1781                 printIndent(*css);
1782                 if (ops[i].value == ~0ULL)
1783                 {
1784                     // This equality test will always succeed, since inputA.a[i] == i
1785                     int idx = deRandom_getUint32(&rnd) % 4;
1786                     *css << "if (inputA.a[" << idx << "] == " << idx << ") {\n";
1787                 }
1788                 else
1789                 {
1790                     const tcu::UVec4 v(ops[i].bvalue);
1791                     *css << std::hex << "if (testBit(uvec4("
1792                          << "0x" << v.x() << ", "
1793                          << "0x" << v.y() << ", "
1794                          << "0x" << v.z() << ", "
1795                          << "0x" << v.w() << std::dec << "), gl_SubgroupInvocationID)) {\n";
1796                 }
1797                 indent += 4;
1798                 break;
1799             case OP_IF_LOOPCOUNT:
1800                 printIndent(*css);
1801                 *css << "if (gl_SubgroupInvocationID == loopIdx" << loopNesting - 1 << ") {\n";
1802                 indent += 4;
1803                 break;
1804             case OP_IF_LOCAL_INVOCATION_INDEX:
1805                 printIfLocalInvocationIndex(*css, makeFlowState());
1806                 indent += 4;
1807                 break;
1808             case OP_ELSE_MASK:
1809             case OP_ELSE_LOOPCOUNT:
1810             case OP_ELSE_LOCAL_INVOCATION_INDEX:
1811                 indent -= 4;
1812                 printIndent(*css);
1813                 *css << "} else {\n";
1814                 indent += 4;
1815                 break;
1816             case OP_ENDIF:
1817                 indent -= 4;
1818                 printIndent(*css);
1819                 *css << "}\n";
1820                 break;
1821             case OP_BALLOT:
1822                 printBallot(*css, makeFlowState(), true);
1823                 break;
1824             case OP_STORE:
1825                 printStore(*css, makeFlowState());
1826                 break;
1827             case OP_BEGIN_FOR_VAR:
1828                 printIndent(*css);
1829                 *css << "for (int loopIdx" << loopNesting << " = 0;\n";
1830                 printIndent(*css);
1831                 *css << "         loopIdx" << loopNesting << " < gl_SubgroupInvocationID + 1;\n";
1832                 printIndent(*css);
1833                 *css << "         loopIdx" << loopNesting << "++) {\n";
1834                 indent += 4;
1835                 loopNesting++;
1836                 break;
1837             case OP_END_FOR_VAR:
1838                 loopNesting--;
1839                 indent -= 4;
1840                 printIndent(*css);
1841                 *css << "}\n";
1842                 break;
1843             case OP_BEGIN_FOR_UNIF:
1844                 printIndent(*css);
1845                 *css << "for (int loopIdx" << loopNesting << " = 0;\n";
1846                 printIndent(*css);
1847                 *css << "         loopIdx" << loopNesting << " < inputA.a[" << ops[i].value << "];\n";
1848                 printIndent(*css);
1849                 *css << "         loopIdx" << loopNesting << "++) {\n";
1850                 indent += 4;
1851                 loopNesting++;
1852                 break;
1853             case OP_END_FOR_UNIF:
1854                 loopNesting--;
1855                 indent -= 4;
1856                 printIndent(*css);
1857                 *css << "}\n";
1858                 break;
1859             case OP_BEGIN_FOR_INF:
1860                 printIndent(*css);
1861                 *css << "for (int loopIdx" << loopNesting << " = 0;;loopIdx" << loopNesting << "++,";
1862                 loopNesting++;
1863                 printBallot(*css, makeFlowState());
1864                 *css << ") {\n";
1865                 indent += 4;
1866                 break;
1867             case OP_END_FOR_INF:
1868                 loopNesting--;
1869                 indent -= 4;
1870                 printIndent(*css);
1871                 *css << "}\n";
1872                 break;
1873             case OP_BEGIN_DO_WHILE_UNIF:
1874                 printIndent(*css);
1875                 *css << "{\n";
1876                 indent += 4;
1877                 printIndent(*css);
1878                 *css << "int loopIdx" << loopNesting << " = 0;\n";
1879                 printIndent(*css);
1880                 *css << "do {\n";
1881                 indent += 4;
1882                 printIndent(*css);
1883                 *css << "loopIdx" << loopNesting << "++;\n";
1884                 loopNesting++;
1885                 break;
1886             case OP_END_DO_WHILE_UNIF:
1887                 loopNesting--;
1888                 indent -= 4;
1889                 printIndent(*css);
1890                 *css << "} while (loopIdx" << loopNesting << " < inputA.a[" << ops[(uint32_t)ops[i].value].value
1891                      << "]);\n";
1892                 indent -= 4;
1893                 printIndent(*css);
1894                 *css << "}\n";
1895                 break;
1896             case OP_BEGIN_DO_WHILE_INF:
1897                 printIndent(*css);
1898                 *css << "{\n";
1899                 indent += 4;
1900                 printIndent(*css);
1901                 *css << "int loopIdx" << loopNesting << " = 0;\n";
1902                 printIndent(*css);
1903                 *css << "do {\n";
1904                 indent += 4;
1905                 loopNesting++;
1906                 break;
1907             case OP_END_DO_WHILE_INF:
1908                 loopNesting--;
1909                 printIndent(*css);
1910                 *css << "loopIdx" << loopNesting << "++;\n";
1911                 indent -= 4;
1912                 printIndent(*css);
1913                 *css << "} while (true);\n";
1914                 indent -= 4;
1915                 printIndent(*css);
1916                 *css << "}\n";
1917                 break;
1918             case OP_BREAK:
1919                 printIndent(*css);
1920                 *css << "break;\n";
1921                 break;
1922             case OP_CONTINUE:
1923                 printIndent(*css);
1924                 *css << "continue;\n";
1925                 break;
1926             case OP_ELECT:
1927                 printIndent(*css);
1928                 *css << "if (subgroupElect()) {\n";
1929                 indent += 4;
1930                 break;
1931             case OP_RETURN:
1932                 printIndent(*css);
1933                 *css << "return;\n";
1934                 break;
1935             case OP_CALL_BEGIN:
1936                 printIndent(*css);
1937                 *css << "func" << funcNum << "(";
1938                 for (int32_t n = 0; n < loopNesting; ++n)
1939                 {
1940                     *css << "loopIdx" << n;
1941                     if (n != loopNesting - 1)
1942                         *css << ", ";
1943                 }
1944                 *css << ");\n";
1945                 css = &functions;
1946                 printIndent(*css);
1947                 *css << "void func" << funcNum << "(";
1948                 for (int32_t n = 0; n < loopNesting; ++n)
1949                 {
1950                     *css << "int loopIdx" << n;
1951                     if (n != loopNesting - 1)
1952                         *css << ", ";
1953                 }
1954                 *css << ") {\n";
1955                 indent += 4;
1956                 funcNum++;
1957                 break;
1958             case OP_CALL_END:
1959                 indent -= 4;
1960                 printIndent(*css);
1961                 *css << "}\n";
1962                 css = &main;
1963                 break;
1964             case OP_NOISE:
1965                 if (ops[i].value == 0)
1966                 {
1967                     printIndent(*css);
1968                     *css << "while (!subgroupElect()) {}\n";
1969                 }
1970                 else
1971                 {
1972                     printIndent(*css);
1973                     *css << "if (inputA.a[0] == 12345) {\n";
1974                     indent += 4;
1975                     printIndent(*css);
1976                     *css << "while (true) {\n";
1977                     indent += 4;
1978                     printBallot(*css, makeFlowState(), true);
1979                     indent -= 4;
1980                     printIndent(*css);
1981                     *css << "}\n";
1982                     indent -= 4;
1983                     printIndent(*css);
1984                     *css << "}\n";
1985                 }
1986                 break;
1987             case OP_SWITCH_UNIF_BEGIN:
1988                 printIndent(*css);
1989                 *css << "switch (inputA.a[" << ops[i].value << "]) {\n";
1990                 indent += 4;
1991                 break;
1992             case OP_SWITCH_VAR_BEGIN:
1993                 printIndent(*css);
1994                 *css << "switch (gl_SubgroupInvocationID & 3) {\n";
1995                 indent += 4;
1996                 break;
1997             case OP_SWITCH_LOOP_COUNT_BEGIN:
1998                 printIndent(*css);
1999                 *css << "switch (loopIdx" << ops[i].value << ") {\n";
2000                 indent += 4;
2001                 break;
2002             case OP_SWITCH_END:
2003                 indent -= 4;
2004                 printIndent(*css);
2005                 *css << "}\n";
2006                 break;
2007             case OP_CASE_MASK_BEGIN:
2008                 for (int32_t b = 0; b < 32; ++b)
2009                 {
2010                     if ((1u << b) & ops[i].caseValue)
2011                     {
2012                         printIndent(*css);
2013                         *css << "case " << b << ":\n";
2014                     }
2015                 }
2016                 printIndent(*css);
2017                 *css << "{\n";
2018                 indent += 4;
2019                 break;
2020             case OP_CASE_LOOP_COUNT_BEGIN:
2021                 if (ops[i].caseValue == 0xFFFFFFFF)
2022                 {
2023                     printIndent(*css);
2024                     *css << "default: {\n";
2025                 }
2026                 else
2027                 {
2028                     printIndent(*css);
2029                     *css << "case " << ops[i].caseValue << ": {\n";
2030                 }
2031                 indent += 4;
2032                 break;
2033             case OP_CASE_END:
2034                 printIndent(*css);
2035                 *css << "break;\n";
2036                 indent -= 4;
2037                 printIndent(*css);
2038                 *css << "}\n";
2039                 break;
2040             default:
2041                 DE_ASSERT(0);
2042                 break;
2043             }
2044         }
2045     }
2046 
2047     // Simulate execution of the program. If countOnly is true, just return
2048     // the max number of outputs written. If it's false, store out the result
2049     // values to ref.
         // Pure virtual (= 0): this class provides no implementation, so every concrete
         // subclass has to supply one. subgroupSize is the subgroup size to simulate with.
2050     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) = 0;
2051 
execute(qpWatchDog * watchDog,bool countOnly,const uint32_t subgroupSize,const uint32_t fragmentStride,const uint32_t primitiveStride,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,add_cref<std::vector<uint32_t>> outputP={},const tcu::UVec4 * cmp=nullptr,const uint32_t primitiveID=(~0u))2052     virtual uint32_t execute(qpWatchDog *watchDog, bool countOnly, const uint32_t subgroupSize,
2053                              const uint32_t fragmentStride, const uint32_t primitiveStride,
2054                              add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2055                              add_cref<std::vector<uint32_t>> outputP = {}, const tcu::UVec4 *cmp = nullptr,
2056                              const uint32_t primitiveID = (~0u))
2057     {
2058         // Per-invocation output location counters
2059         std::vector<uint32_t> outLoc;
2060         std::vector<SubgroupState2> stateStack;
2061         uint32_t subgroupCount;
2062         uint32_t logFailureCount;
2063         auto prerequisites = makePrerequisites(outputP, subgroupSize, fragmentStride, primitiveStride, stateStack,
2064                                                outLoc, subgroupCount);
2065         const Ballot fullSubgroupMask = subgroupSizeToMask(subgroupSize, subgroupCount);
2066 
2067         logFailureCount = 10u;
2068         nesting         = 0;
2069         loopNesting     = 0;
2070 
2071         int32_t i          = 0;
2072         uint32_t loopCount = 0;
2073 
2074         while (i < (int32_t)ops.size())
2075         {
2076             add_cref<Ballots> activeMask = stateStack[nesting].activeMask;
2077 
2078             if ((loopCount % 5000) == 0 && watchDog)
2079                 qpWatchDog_touch(watchDog);
2080 
2081             switch (ops[i].type)
2082             {
2083             case OP_BALLOT:
2084                 // Flag that this ballot is workgroup-nonuniform
2085                 if (caseDef.isWUCF() && activeMask.any() && !activeMask.all())
2086                     ops[i].caseValue = 1;
2087 
2088                 if (caseDef.isSUCF())
2089                 {
2090                     for (uint32_t id = 0; id < invocationStride; id += subgroupSize)
2091                     {
2092                         const Ballot subgroupMask = bitsetToBallot(activeMask, fullSubgroupMask, subgroupSize, id);
2093                         // Flag that this ballot is subgroup-nonuniform
2094                         if (subgroupMask != 0 && subgroupMask != fullSubgroupMask)
2095                             ops[i].caseValue = 1;
2096                     }
2097                 }
2098 
2099                 simulateBallot(countOnly, activeMask, primitiveID, i, outLoc, ref, log, prerequisites, logFailureCount,
2100                                (i > 0 ? ops[i - 1].type : OP_BALLOT), cmp);
2101                 break;
2102             case OP_STORE:
2103                 simulateStore(countOnly, stateStack[nesting].activeMask, primitiveID, ops[i].value, outLoc, ref, log,
2104                               prerequisites, logFailureCount, (i > 0 ? ops[i - 1].type : OP_STORE), cmp);
2105                 break;
2106             case OP_IF_MASK:
2107                 nesting++;
2108                 stateStack[nesting].activeMask =
2109                     stateStack[nesting - 1].activeMask & ballotsFromBallot(ops[i].bvalue, subgroupSize, subgroupCount);
2110                 stateStack[nesting].header   = i;
2111                 stateStack[nesting].isLoop   = 0;
2112                 stateStack[nesting].isSwitch = 0;
2113                 break;
2114             case OP_ELSE_MASK:
2115                 stateStack[nesting].activeMask =
2116                     stateStack[nesting - 1].activeMask &
2117                     ~ballotsFromBallot(ops[stateStack[nesting].header].bvalue, subgroupSize, subgroupCount);
2118                 break;
2119             case OP_IF_LOOPCOUNT:
2120             {
2121                 uint32_t n = nesting;
2122                 while (!stateStack[n].isLoop)
2123                     n--;
2124                 const Ballot tripBallot = Ballot::withSetBit(stateStack[n].tripCount);
2125 
2126                 nesting++;
2127                 stateStack[nesting].activeMask =
2128                     stateStack[nesting - 1].activeMask & ballotsFromBallot(tripBallot, subgroupSize, subgroupCount);
2129                 stateStack[nesting].header   = i;
2130                 stateStack[nesting].isLoop   = 0;
2131                 stateStack[nesting].isSwitch = 0;
2132                 break;
2133             }
2134             case OP_ELSE_LOOPCOUNT:
2135             {
2136                 uint32_t n = nesting;
2137                 while (!stateStack[n].isLoop)
2138                     n--;
2139                 const Ballot tripBallot = Ballot::withSetBit(stateStack[n].tripCount);
2140 
2141                 stateStack[nesting].activeMask =
2142                     stateStack[nesting - 1].activeMask & ~ballotsFromBallot(tripBallot, subgroupSize, subgroupCount);
2143                 break;
2144             }
2145             case OP_IF_LOCAL_INVOCATION_INDEX:
2146             {
2147                 // all bits >= N
2148                 Ballots mask(subgroupCount);
2149                 const uint32_t maxID = subgroupCount * subgroupSize;
2150                 for (uint32_t id = static_cast<uint32_t>(ops[i].value); id < maxID; ++id)
2151                 {
2152                     mask.set(Ballots::findBit(id, subgroupSize));
2153                 }
2154 
2155                 nesting++;
2156                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
2157                 stateStack[nesting].header     = i;
2158                 stateStack[nesting].isLoop     = 0;
2159                 stateStack[nesting].isSwitch   = 0;
2160                 break;
2161             }
2162             case OP_ELSE_LOCAL_INVOCATION_INDEX:
2163             {
2164                 // all bits < N
2165                 Ballots mask(subgroupCount);
2166                 const uint32_t maxID = subgroupCount * subgroupSize;
2167                 for (uint32_t id = 0u; id < static_cast<uint32_t>(ops[i].value) && id < maxID; ++id)
2168                 {
2169                     mask.set(Ballots::findBit(id, subgroupSize));
2170                 }
2171 
2172                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
2173                 break;
2174             }
2175             case OP_ENDIF:
2176                 nesting--;
2177                 break;
2178             case OP_BEGIN_FOR_UNIF:
2179                 // XXX TODO: We don't handle a for loop with zero iterations
2180                 nesting++;
2181                 loopNesting++;
2182                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
2183                 stateStack[nesting].header       = i;
2184                 stateStack[nesting].tripCount    = 0;
2185                 stateStack[nesting].isLoop       = 1;
2186                 stateStack[nesting].isSwitch     = 0;
2187                 stateStack[nesting].continueMask = 0;
2188                 break;
2189             case OP_END_FOR_UNIF:
2190                 stateStack[nesting].tripCount++;
2191                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2192                 stateStack[nesting].continueMask = 0;
2193                 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
2194                     stateStack[nesting].activeMask.any())
2195                 {
2196                     i = stateStack[nesting].header + 1;
2197                     continue;
2198                 }
2199                 else
2200                 {
2201                     loopNesting--;
2202                     nesting--;
2203                 }
2204                 break;
2205             case OP_BEGIN_DO_WHILE_UNIF:
2206                 // XXX TODO: We don't handle a for loop with zero iterations
2207                 nesting++;
2208                 loopNesting++;
2209                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
2210                 stateStack[nesting].header       = i;
2211                 stateStack[nesting].tripCount    = 1;
2212                 stateStack[nesting].isLoop       = 1;
2213                 stateStack[nesting].isSwitch     = 0;
2214                 stateStack[nesting].continueMask = 0;
2215                 break;
2216             case OP_END_DO_WHILE_UNIF:
2217                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2218                 stateStack[nesting].continueMask = 0;
2219                 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
2220                     stateStack[nesting].activeMask.any())
2221                 {
2222                     i = stateStack[nesting].header + 1;
2223                     stateStack[nesting].tripCount++;
2224                     continue;
2225                 }
2226                 else
2227                 {
2228                     loopNesting--;
2229                     nesting--;
2230                 }
2231                 break;
2232             case OP_BEGIN_FOR_VAR:
2233                 // XXX TODO: We don't handle a for loop with zero iterations
2234                 nesting++;
2235                 loopNesting++;
2236                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
2237                 stateStack[nesting].header       = i;
2238                 stateStack[nesting].tripCount    = 0;
2239                 stateStack[nesting].isLoop       = 1;
2240                 stateStack[nesting].isSwitch     = 0;
2241                 stateStack[nesting].continueMask = 0;
2242                 break;
2243             case OP_END_FOR_VAR:
2244             {
2245                 stateStack[nesting].tripCount++;
2246                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2247                 stateStack[nesting].continueMask = 0;
2248                 Ballot tripBallot;
2249                 if (subgroupSize != stateStack[nesting].tripCount)
2250                 {
2251                     for (uint32_t bit = stateStack[nesting].tripCount; bit < tripBallot.size(); ++bit)
2252                         tripBallot.set(bit);
2253                 }
2254                 stateStack[nesting].activeMask &= ballotsFromBallot(tripBallot, subgroupSize, subgroupCount);
2255 
2256                 if (stateStack[nesting].activeMask.any())
2257                 {
2258                     i = stateStack[nesting].header + 1;
2259                     continue;
2260                 }
2261                 else
2262                 {
2263                     loopNesting--;
2264                     nesting--;
2265                 }
2266                 break;
2267             }
2268             case OP_BEGIN_FOR_INF:
2269             case OP_BEGIN_DO_WHILE_INF:
2270                 nesting++;
2271                 loopNesting++;
2272                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
2273                 stateStack[nesting].header       = i;
2274                 stateStack[nesting].tripCount    = 0;
2275                 stateStack[nesting].isLoop       = 1;
2276                 stateStack[nesting].isSwitch     = 0;
2277                 stateStack[nesting].continueMask = 0;
2278                 break;
2279             case OP_END_FOR_INF:
2280                 stateStack[nesting].tripCount++;
2281                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2282                 stateStack[nesting].continueMask = 0;
2283                 if (stateStack[nesting].activeMask.any())
2284                 {
2285                     // output expected OP_BALLOT values
2286                     simulateBallot(countOnly, stateStack[nesting].activeMask, primitiveID, i, outLoc, ref, log,
2287                                    prerequisites, logFailureCount, (i > 0 ? ops[i - 1].type : OP_BALLOT), cmp);
2288 
2289                     i = stateStack[nesting].header + 1;
2290                     continue;
2291                 }
2292                 else
2293                 {
2294                     loopNesting--;
2295                     nesting--;
2296                 }
2297                 break;
2298             case OP_END_DO_WHILE_INF:
2299                 stateStack[nesting].tripCount++;
2300                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2301                 stateStack[nesting].continueMask = 0;
2302                 if (stateStack[nesting].activeMask.any())
2303                 {
2304                     i = stateStack[nesting].header + 1;
2305                     continue;
2306                 }
2307                 else
2308                 {
2309                     loopNesting--;
2310                     nesting--;
2311                 }
2312                 break;
2313             case OP_BREAK:
2314             {
2315                 uint32_t n         = nesting;
2316                 const Ballots mask = stateStack[nesting].activeMask;
2317                 while (true)
2318                 {
2319                     stateStack[n].activeMask &= ~mask;
2320                     if (stateStack[n].isLoop || stateStack[n].isSwitch)
2321                         break;
2322 
2323                     n--;
2324                 }
2325             }
2326             break;
2327             case OP_CONTINUE:
2328             {
2329                 uint32_t n         = nesting;
2330                 const Ballots mask = stateStack[nesting].activeMask;
2331                 while (true)
2332                 {
2333                     stateStack[n].activeMask &= ~mask;
2334                     if (stateStack[n].isLoop)
2335                     {
2336                         stateStack[n].continueMask |= mask;
2337                         break;
2338                     }
2339                     n--;
2340                 }
2341             }
2342             break;
2343             case OP_ELECT:
2344             {
2345                 nesting++;
2346                 stateStack[nesting].activeMask = bitsetElect(stateStack[nesting - 1].activeMask);
2347                 stateStack[nesting].header     = i;
2348                 stateStack[nesting].isLoop     = 0;
2349                 stateStack[nesting].isSwitch   = 0;
2350             }
2351             break;
2352             case OP_RETURN:
2353             {
2354                 const Ballots mask = stateStack[nesting].activeMask;
2355                 for (int32_t n = nesting; n >= 0; --n)
2356                 {
2357                     stateStack[n].activeMask &= ~mask;
2358                     if (stateStack[n].isCall)
2359                         break;
2360                 }
2361             }
2362             break;
2363 
2364             case OP_CALL_BEGIN:
2365                 nesting++;
2366                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2367                 stateStack[nesting].isLoop     = 0;
2368                 stateStack[nesting].isSwitch   = 0;
2369                 stateStack[nesting].isCall     = 1;
2370                 break;
2371             case OP_CALL_END:
2372                 stateStack[nesting].isCall = 0;
2373                 nesting--;
2374                 break;
2375             case OP_NOISE:
2376                 break;
2377 
2378             case OP_SWITCH_UNIF_BEGIN:
2379             case OP_SWITCH_VAR_BEGIN:
2380             case OP_SWITCH_LOOP_COUNT_BEGIN:
2381                 nesting++;
2382                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2383                 stateStack[nesting].header     = i;
2384                 stateStack[nesting].isLoop     = 0;
2385                 stateStack[nesting].isSwitch   = 1;
2386                 break;
2387             case OP_SWITCH_END:
2388                 nesting--;
2389                 break;
2390             case OP_CASE_MASK_BEGIN:
2391                 stateStack[nesting].activeMask =
2392                     stateStack[nesting - 1].activeMask & ballotsFromBallot(ops[i].bvalue, subgroupSize, subgroupCount);
2393                 break;
2394             case OP_CASE_LOOP_COUNT_BEGIN:
2395             {
2396                 uint32_t n = nesting;
2397                 uint32_t l = loopNesting;
2398 
2399                 while (true)
2400                 {
2401                     if (stateStack[n].isLoop)
2402                     {
2403                         l--;
2404                         if (l == ops[stateStack[nesting].header].value)
2405                             break;
2406                     }
2407                     n--;
2408                 }
2409 
2410                 if ((Ballot::withSetBit(stateStack[n].tripCount) & Ballot(ops[i].bvalue)).any())
2411                     stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2412                 else
2413                     stateStack[nesting].activeMask = 0;
2414                 break;
2415             }
2416             case OP_CASE_END:
2417                 break;
2418 
2419             default:
2420                 DE_ASSERT(0);
2421                 break;
2422             }
2423             i++;
2424             loopCount++;
2425         }
2426         uint32_t maxLoc = 0;
2427         for (uint32_t id = 0; id < (uint32_t)outLoc.size(); ++id)
2428             maxLoc = de::max(maxLoc, outLoc[id]);
2429 
2430         return maxLoc;
2431     }
2432 
hasUCF() const2433     bool hasUCF() const
2434     {
2435         for (int32_t i = 0; i < (int32_t)ops.size(); ++i)
2436         {
2437             if (ops[i].type == OP_BALLOT && ops[i].caseValue == 0)
2438                 return true;
2439         }
2440         return false;
2441     }
2442 
2443 protected:
makePrerequisites(add_cref<std::vector<uint32_t>> outputP,const uint32_t subgroupSize,const uint32_t fragmentStride,const uint32_t primitiveStride,add_ref<std::vector<SubgroupState2>> stateStack,add_ref<std::vector<uint32_t>> outLoc,add_ref<uint32_t> subgroupCount)2444     virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
2445                                                              const uint32_t subgroupSize, const uint32_t fragmentStride,
2446                                                              const uint32_t primitiveStride,
2447                                                              add_ref<std::vector<SubgroupState2>> stateStack,
2448                                                              add_ref<std::vector<uint32_t>> outLoc,
2449                                                              add_ref<uint32_t> subgroupCount)
2450     {
2451         DE_UNREF(outputP);
2452         DE_UNREF(subgroupSize);
2453         DE_UNREF(fragmentStride);
2454         DE_UNREF(primitiveStride);
2455         DE_UNREF(stateStack);
2456         DE_UNREF(outLoc);
2457         DE_UNREF(subgroupCount);
2458         return std::make_shared<Prerequisites>();
2459     }
2460 
simulateBallot(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t primitiveID,const int32_t opsIndex,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)2461     virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t primitiveID,
2462                                 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
2463                                 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2464                                 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2465                                 const OPType reason, const tcu::UVec4 *cmp)
2466     {
2467         DE_UNREF(countOnly);
2468         DE_UNREF(activeMask);
2469         DE_UNREF(primitiveID);
2470         DE_UNREF(opsIndex);
2471         DE_UNREF(outLoc);
2472         DE_UNREF(ref);
2473         DE_UNREF(log);
2474         DE_UNREF(prerequisites);
2475         DE_UNREF(logFailureCount);
2476         DE_UNREF(reason);
2477         DE_UNREF(cmp);
2478     }
2479 
simulateStore(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t primitiveID,const uint64_t storeValue,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)2480     virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t primitiveID,
2481                                const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
2482                                add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2483                                std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2484                                const OPType reason, const tcu::UVec4 *cmp)
2485     {
2486         DE_UNREF(countOnly);
2487         DE_UNREF(activeMask);
2488         DE_UNREF(primitiveID);
2489         DE_UNREF(storeValue);
2490         DE_UNREF(outLoc);
2491         DE_UNREF(ref);
2492         DE_UNREF(log);
2493         DE_UNREF(prerequisites);
2494         DE_UNREF(logFailureCount);
2495         DE_UNREF(reason);
2496         DE_UNREF(cmp);
2497     }
2498 };
2499 
2500 class ComputeRandomProgram : public RandomProgram
2501 {
2502 public:
ComputeRandomProgram(const CaseDef & c)2503     ComputeRandomProgram(const CaseDef &c) : RandomProgram(c, uint32_t(c.sizeX * c.sizeY))
2504     {
2505         DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_COMPUTE_BIT);
2506     }
2507     virtual ~ComputeRandomProgram() = default;
2508 
simulate(bool countOnly,uint32_t subgroupSize,add_ref<std::vector<uint64_t>> ref)2509     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
2510     {
2511         DE_ASSERT(false);
2512         // Do not use this method, to simulate generated program use simulate2 instead
2513         DE_UNREF(countOnly);
2514         DE_UNREF(subgroupSize);
2515         DE_UNREF(ref);
2516         return 0;
2517     }
2518 
2519     struct ComputePrerequisites : Prerequisites
2520     {
2521         const uint32_t subgroupSize;
2522         const uint32_t subgroupCount;
2523         const Ballot subgroupSizeMask;
2524         std::vector<std::pair<bool, tcu::UVec4>> ballots;
ComputePrerequisitesvkt::Reconvergence::__anon4f2394780111::ComputeRandomProgram::ComputePrerequisites2525         ComputePrerequisites(uint32_t subgroupSize_, uint32_t subgroupCount_)
2526             : subgroupSize(subgroupSize_)
2527             , subgroupCount(subgroupCount_)
2528             , subgroupSizeMask(subgroupSizeToMask(subgroupSize, subgroupCount))
2529             , ballots(subgroupCount_)
2530         {
2531         }
2532     };
2533 
printBallot(add_ref<std::stringstream> css,add_cref<FlowState>,bool endWithSemicolon=false)2534     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
2535                              bool endWithSemicolon = false) override
2536     {
2537         printIndent(css);
2538 
2539         css << "outputC.loc[gl_LocalInvocationIndex]++,";
2540         // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
2541         // a ballot, to make sure the ballot is "diverged enough". Don't do this for
2542         // subgroup_uniform_control_flow, since we only validate results that must be fully
2543         // reconverged.
2544         if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
2545         {
2546             css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = " << getPartitionBallotText();
2547         }
2548         else if (caseDef.isElect())
2549         {
2550             css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex].x = elect()";
2551         }
2552         else
2553         {
2554             css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = subgroupBallot(true)";
2555         }
2556         if (endWithSemicolon)
2557         {
2558             css << ";\n";
2559         }
2560     }
2561 
2562 protected:
simulateBallot(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t unusedPrimitiveID,const int32_t opsIndex,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)2563     virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
2564                                 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
2565                                 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2566                                 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2567                                 const OPType reason, const tcu::UVec4 *cmp) override
2568     {
2569         DE_UNREF(unusedPrimitiveID);
2570         DE_UNREF(log);
2571         DE_UNREF(logFailureCount);
2572         DE_UNREF(reason);
2573         DE_UNREF(cmp);
2574         auto pre                     = static_pointer_cast<ComputePrerequisites>(prerequisites);
2575         const uint32_t subgroupCount = activeMask.subgroupCount();
2576         const uint32_t subgroupSize  = pre->subgroupSize;
2577 
2578         std::fill_n(pre->ballots.begin(), subgroupCount, std::pair<bool, tcu::UVec4>());
2579 
2580         for (uint32_t id = 0; id < invocationStride; ++id)
2581         {
2582             if (activeMask.test((Ballots::findBit(id, subgroupSize))))
2583             {
2584                 if (countOnly)
2585                 {
2586                     outLoc[id]++;
2587                 }
2588                 else
2589                 {
2590                     if (ops[opsIndex].caseValue)
2591                     {
2592                         // Emit a magic value to indicate that we shouldn't validate this ballot
2593                         ref[(outLoc[id]++) * invocationStride + id] =
2594                             bitsetToBallot(0x12345678, subgroupCount, subgroupSize, id);
2595                     }
2596                     else
2597                     {
2598                         add_ref<std::pair<bool, tcu::UVec4>> info(pre->ballots.at(id / subgroupSize));
2599                         if (false == info.first)
2600                         {
2601                             info.first  = true;
2602                             info.second = bitsetToBallot(activeMask, pre->subgroupSizeMask, subgroupSize, id);
2603                         }
2604                         ref[(outLoc[id]++) * invocationStride + id] = info.second;
2605                     }
2606                 }
2607             }
2608         }
2609     }
2610 
simulateStore(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t unusedPrimitiveID,const uint64_t storeValue,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)2611     virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
2612                                const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
2613                                add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2614                                std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2615                                const OPType reason, const tcu::UVec4 *cmp) override
2616     {
2617         DE_UNREF(unusedPrimitiveID);
2618         DE_UNREF(log);
2619         DE_UNREF(logFailureCount);
2620         DE_UNREF(reason);
2621         DE_UNREF(cmp);
2622         const uint32_t subgroupSize = static_pointer_cast<ComputePrerequisites>(prerequisites)->subgroupSize;
2623         for (uint32_t id = 0; id < invocationStride; ++id)
2624         {
2625             if (activeMask.test(Ballots::findBit(id, subgroupSize)))
2626             {
2627                 if (countOnly)
2628                     outLoc[id]++;
2629                 else
2630                     ref[(outLoc[id]++) * invocationStride + id][0] = uint32_t(storeValue & 0xFFFFFFFF);
2631             }
2632         }
2633     }
2634 
makePrerequisites(add_cref<std::vector<uint32_t>> outputP,const uint32_t subgroupSize,const uint32_t fragmentStride,const uint32_t primitiveStride,add_ref<std::vector<SubgroupState2>> stateStack,add_ref<std::vector<uint32_t>> outLoc,add_ref<uint32_t> subgroupCount)2635     virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
2636                                                              const uint32_t subgroupSize, const uint32_t fragmentStride,
2637                                                              const uint32_t primitiveStride,
2638                                                              add_ref<std::vector<SubgroupState2>> stateStack,
2639                                                              add_ref<std::vector<uint32_t>> outLoc,
2640                                                              add_ref<uint32_t> subgroupCount) override
2641     {
2642         DE_UNREF(outputP);
2643         DE_UNREF(fragmentStride);
2644         DE_ASSERT(invocationStride == primitiveStride);
2645         subgroupCount      = ROUNDUP(invocationStride, subgroupSize) / subgroupSize;
2646         auto prerequisites = std::make_shared<ComputePrerequisites>(subgroupSize, subgroupCount);
2647         stateStack.resize(10u, SubgroupState2(subgroupCount));
2648         outLoc.resize(primitiveStride, 0u);
2649         add_ref<Ballots> activeMask(stateStack.at(0).activeMask);
2650         for (uint32_t id = 0; id < invocationStride; ++id)
2651         {
2652             activeMask.set(Ballots::findBit(id, subgroupSize));
2653         }
2654         return prerequisites;
2655     }
2656 };
2657 
2658 class FragmentRandomProgram : public RandomProgram
2659 {
2660 public:
2661 #define BALLOT_STACK_SIZE_DEFVAL_LINE (__LINE__ + 1)
2662     static constexpr const uint32_t experimentalOutLocSize      = 16384;
2663     static constexpr const uint32_t conditionIfInvocationStride = 511u;
FragmentRandomProgram(const CaseDef & c)2664     FragmentRandomProgram(const CaseDef &c) : RandomProgram(c, conditionIfInvocationStride)
2665     {
2666         DE_ASSERT(caseDef.testType == TT_MAXIMAL);
2667         DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_FRAGMENT_BIT);
2668     }
2669     virtual ~FragmentRandomProgram() = default;
2670 
create(const CaseDef & c)2671     static de::MovePtr<FragmentRandomProgram> create(const CaseDef &c)
2672     {
2673         return de::MovePtr<FragmentRandomProgram>(new FragmentRandomProgram(c));
2674     }
2675 
printIfLocalInvocationIndex(add_ref<std::stringstream> css,add_cref<FlowState> flow)2676     virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
2677     {
2678         printIndent(css);
2679         css << "if (invocationIndex() >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
2680     }
2681 
printStore(add_ref<std::stringstream> css,add_cref<FlowState> flow)2682     virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
2683     {
2684         printIndent(css);
2685         css << "storeValue(outLoc++, 0x" << std::hex << flow.ops[flow.opsIndex].value << ");\n";
2686     }
2687 
printBallot(add_ref<std::stringstream> css,add_cref<FlowState>,bool endWidthSemicolon=false)2688     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
2689                              bool endWidthSemicolon = false) override
2690     {
2691         printIndent(css);
2692         // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
2693         // a ballot, to make sure the ballot is "diverged enough". Don't do this for
2694         // subgroup_uniform_control_flow, since we only validate results that must be fully
2695         // reconverged.
2696         if (loopNesting > 0)
2697         {
2698             css << "storeBallot(outLoc++)";
2699         }
2700         else
2701         {
2702             css << getPartitionBallotText();
2703         }
2704         if (endWidthSemicolon)
2705         {
2706             css << ";\n";
2707         }
2708     }
2709 
getPartitionBallotText()2710     virtual std::string getPartitionBallotText() override
2711     {
2712         return "storeBallot(outLoc++)";
2713     }
2714 
genIf(IFType ifType,uint32_t maxLocalIndexCmp=0u)2715     virtual void genIf(IFType ifType, uint32_t maxLocalIndexCmp = 0u) override
2716     {
2717         DE_UNREF(maxLocalIndexCmp);
2718         RandomProgram::genIf(ifType, conditionIfInvocationStride);
2719     }
2720 
2721     struct Arrangement : Prerequisites, ReconvergenceTestFragmentInstance::Arrangement
2722     {
2723         const uint32_t m_width;
2724         const uint32_t m_height;
2725         const uint32_t m_subgroupSize;
2726         const uint32_t m_fragmentStride;
2727         const uint32_t m_primitiveStride;
2728         const uint32_t m_subgroupCount;
2729         const Ballots m_initialBallots;
2730         const Ballots m_nonHelperInitialBallots;
2731         const uint32_t m_invocationStride;
2732         const std::vector<std::vector<uint32_t>> m_fragmentSubgroups;
Arrangementvkt::Reconvergence::__anon4f2394780111::FragmentRandomProgram::Arrangement2733         Arrangement(add_cref<std::vector<uint32_t>> info, uint32_t width, uint32_t height, uint32_t subgroupSize,
2734                     uint32_t primitiveStride)
2735             : m_width(width)
2736             , m_height(height)
2737             , m_subgroupSize(subgroupSize)
2738             , m_fragmentStride(width * height)
2739             , m_primitiveStride(primitiveStride)
2740             , m_subgroupCount(calcSubgroupCount(info, primitiveStride, m_fragmentStride))
2741             , m_initialBallots(makeInitialBallots(info, primitiveStride, m_fragmentStride, false))
2742             , m_nonHelperInitialBallots(makeInitialBallots(info, primitiveStride, m_fragmentStride, true))
2743             , m_invocationStride(calcInvocationStride(info, subgroupSize, primitiveStride, m_fragmentStride))
2744             , m_fragmentSubgroups(makeFragmentSubgroups(info, subgroupSize, primitiveStride, m_fragmentStride))
2745         {
2746         }
calcSubgroupCountvkt::Reconvergence::__anon4f2394780111::FragmentRandomProgram::Arrangement2747         static uint32_t calcSubgroupCount(add_cref<std::vector<uint32_t>> info, const uint32_t primitiveStride,
2748                                           const uint32_t fragmentStride)
2749         {
2750             const uint32_t cc = fragmentStride * primitiveStride;
2751             std::set<uint32_t> s;
2752             uint32_t subgroupID;
2753             uint32_t subgroupInvocationID;
2754             uint32_t isHelperInvocation;
2755             for (uint32_t c = 0u; c < cc; ++c)
2756             {
2757                 if (validID(info.at(c), subgroupID, subgroupInvocationID, isHelperInvocation))
2758                     s.insert(subgroupID);
2759             }
2760             const uint32_t gMin = *s.begin();
2761             DE_UNREF(gMin);
2762             const uint32_t gMax = *std::next(s.begin(), (s.size() - 1u));
2763             DE_UNREF(gMax);
2764             DE_ASSERT(gMin == 0u);
2765             DE_ASSERT(gMax == (s.size() - 1u));
2766             return static_cast<uint32_t>(s.size());
2767         }
calcInvocationStridevkt::Reconvergence::__anon4f2394780111::FragmentRandomProgram::Arrangement2768         static uint32_t calcInvocationStride(add_cref<std::vector<uint32_t>> info, const uint32_t subgroupSize,
2769                                              const uint32_t primitiveStride, const uint32_t fragmentStride)
2770         {
2771             return calcSubgroupCount(info, fragmentStride, primitiveStride) * subgroupSize;
2772         }
makeInitialBallotsvkt::Reconvergence::__anon4f2394780111::FragmentRandomProgram::Arrangement2773         static Ballots makeInitialBallots(add_cref<std::vector<uint32_t>> info, const uint32_t primitiveStride,
2774                                           const uint32_t fragmentStride, bool excludeHelpers)
2775         {
2776             uint32_t subgroupID;
2777             uint32_t subgroupInvocationID;
2778             uint32_t isHelperInvocation;
2779             Ballots b(calcSubgroupCount(info, fragmentStride, primitiveStride));
2780             const uint32_t cc = fragmentStride * primitiveStride;
2781             for (uint32_t c = 0u; c < cc; ++c)
2782             {
2783                 if (validID(info.at(c), subgroupID, subgroupInvocationID, isHelperInvocation))
2784                 {
2785                     if (!(excludeHelpers && (isHelperInvocation != 0)))
2786                         b.at(subgroupID).set(subgroupInvocationID);
2787                 }
2788             }
2789             return b;
2790         }
2791         // Fully Qualified Invocation Name
fqinvkt::Reconvergence::__anon4f2394780111::FragmentRandomProgram::Arrangement2792         static uint32_t fqin(uint32_t maybeHelperFragmentFQIN, add_ref<uint32_t> isHelperInvocation)
2793         {
2794             isHelperInvocation = maybeHelperFragmentFQIN >> 31;
2795             return (maybeHelperFragmentFQIN & 0x7FFFFFFF);
2796         }
makeFragmentSubgroupsvkt::Reconvergence::__anon4f2394780111::FragmentRandomProgram::Arrangement2797         static auto makeFragmentSubgroups(add_cref<std::vector<uint32_t>> info, const uint32_t subgroupSize,
2798                                           const uint32_t primitiveStride, const uint32_t fragmentStride)
2799             -> std::vector<std::vector<uint32_t>>
2800         {
2801             const uint32_t subgroupCount = calcSubgroupCount(info, fragmentStride, primitiveStride);
2802             std::vector<std::vector<uint32_t>> map(primitiveStride);
2803             for (uint32_t p = 0u; p < primitiveStride; ++p)
2804                 map[p].resize(fragmentStride, (subgroupCount * subgroupSize));
2805 
2806             uint32_t subgroupID;
2807             uint32_t subgroupInvocationID;
2808             uint32_t isHelperInvocation;
2809             for (uint32_t p = 0u; p < primitiveStride; ++p)
2810                 for (uint32_t f = 0u; f < fragmentStride; ++f)
2811                 {
2812                     const uint32_t sgid = info.at(f * primitiveStride + p);
2813                     if (validID(sgid, subgroupID, subgroupInvocationID, isHelperInvocation))
2814                         map.at(p).at(f) =
2815                             (subgroupID * subgroupSize + subgroupInvocationID) | (isHelperInvocation << 31);
2816                 }
2817             return map;
2818         }
calcRealInvocationCountvkt::Reconvergence::__anon4f2394780111::FragmentRandomProgram::Arrangement2819         static uint32_t calcRealInvocationCount(add_cref<std::vector<uint32_t>> info, uint32_t primitiveStride,
2820                                                 uint32_t fragmentStride)
2821         {
2822             const uint32_t cc = fragmentStride * primitiveStride;
2823             uint32_t n        = 0u;
2824             for (uint32_t c = 0u; c < cc; ++c)
2825             {
2826                 if (info[c])
2827                     ++n;
2828             }
2829             return n;
2830         }
2831 
2832     private:
validIDvkt::Reconvergence::__anon4f2394780111::FragmentRandomProgram::Arrangement2833         static bool validID(const uint32_t id)
2834         {
2835             uint32_t subgroupID;
2836             DE_UNREF(subgroupID);
2837             uint32_t subgroupInvocationID;
2838             DE_UNREF(subgroupInvocationID);
2839             uint32_t isHelperInvocation;
2840             DE_UNREF(isHelperInvocation);
2841             return validID(id, subgroupID, subgroupInvocationID, isHelperInvocation);
2842         }
validIDvkt::Reconvergence::__anon4f2394780111::FragmentRandomProgram::Arrangement2843         static bool validID(const uint32_t id, add_ref<uint32_t> subgroupID, add_ref<uint32_t> subgroupInvocationID,
2844                             add_ref<uint32_t> isHelperInvocation)
2845         {
2846             if (id != 0u)
2847             {
2848                 subgroupInvocationID = (id & 0xFFFF);
2849                 subgroupID           = ((id >> 16) & 0x7FFF) - 1u;
2850                 isHelperInvocation   = (id >> 31);
2851                 return true;
2852             }
2853             return false;
2854         }
2855     };
2856 
simulate(bool countOnly,uint32_t subgroupSize,add_ref<std::vector<uint64_t>> ref)2857     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
2858     {
2859         DE_ASSERT(false); // use overloaded version of simulate() instead
2860         DE_UNREF(countOnly);
2861         DE_UNREF(subgroupSize);
2862         DE_UNREF(ref);
2863         return 0;
2864     }
2865 
2866     // Simulate execution of the program. If countOnly is true, just return
2867     // the max number of outputs written. If it's false, store out the result
2868     // values to ref.
execute(qpWatchDog * watchDog,bool countOnly,const uint32_t subgroupSize,const uint32_t fragmentStride,const uint32_t primitiveStride,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,add_cref<std::vector<uint32_t>> outputP,const tcu::UVec4 * cmp=nullptr,const uint32_t reserved=(~0u))2869     virtual uint32_t execute(qpWatchDog *watchDog, bool countOnly, const uint32_t subgroupSize,
2870                              const uint32_t fragmentStride, const uint32_t primitiveStride,
2871                              add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2872                              add_cref<std::vector<uint32_t>> outputP, const tcu::UVec4 *cmp = nullptr,
2873                              const uint32_t reserved = (~0u)) override
2874     {
2875         DE_UNREF(reserved);
2876         uint32_t outLocs    = 0u;
2877         uint32_t maxOutLocs = 0u;
2878         for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
2879         {
2880             outLocs    = RandomProgram::execute(watchDog, countOnly, subgroupSize, fragmentStride, primitiveStride, ref,
2881                                                 log, outputP, cmp, primitiveID);
2882             maxOutLocs = std::max(outLocs, maxOutLocs);
2883         }
2884         return maxOutLocs;
2885     }
2886 
2887 protected:
simulateStore(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t primitiveID,const uint64_t storeValue,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)2888     virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t primitiveID,
2889                                const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
2890                                add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2891                                std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2892                                const OPType reason, const tcu::UVec4 *cmp) override
2893     {
2894         uint32_t isHelperInvocation;
2895         add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
2896         for (const uint32_t id : a.m_fragmentSubgroups.at(primitiveID))
2897         {
2898             const uint32_t sgid = a.fqin(id, isHelperInvocation);
2899             if (sgid >= (a.m_subgroupCount * a.m_subgroupSize))
2900                 continue;
2901             if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
2902                 continue;
2903             const uint32_t loc   = primitiveID * a.m_subgroupCount * 128 + sgid;
2904             const uint32_t index = ((outLoc.at(loc)++) * (a.m_primitiveStride * a.m_subgroupCount * 128) +
2905                                     (primitiveID * a.m_subgroupCount * 128) + sgid);
2906             if (false == countOnly)
2907             {
2908                 ref.at(index) = tcu::UVec4(uint32_t(storeValue & 0xFFFFFFFF), 0u, 0u, 0u);
2909                 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
2910                 {
2911                     logFailureCount -= 1u;
2912                     log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
2913                         << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
2914                 }
2915             }
2916         }
2917     }
2918 
simulateBallot(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t primitiveID,const int32_t opsIndex,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)2919     virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t primitiveID,
2920                                 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
2921                                 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2922                                 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2923                                 const OPType reason, const tcu::UVec4 *cmp) override
2924     {
2925         DE_UNREF(opsIndex);
2926         uint32_t isHelperInvocation;
2927         add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
2928         for (const uint32_t id : a.m_fragmentSubgroups.at(primitiveID))
2929         {
2930             const uint32_t sgid = a.fqin(id, isHelperInvocation);
2931             if (sgid >= (a.m_subgroupCount * a.m_subgroupSize))
2932                 continue;
2933             if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
2934                 continue;
2935             const uint32_t loc   = primitiveID * a.m_subgroupCount * 128 + sgid;
2936             const uint32_t index = ((outLoc.at(loc)++) * (a.m_primitiveStride * a.m_subgroupCount * 128) +
2937                                     (primitiveID * a.m_subgroupCount * 128) + sgid);
2938             if (false == countOnly)
2939             {
2940                 ref.at(index) = Ballot(activeMask.at(sgid / a.m_subgroupSize));
2941                 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
2942                 {
2943                     logFailureCount -= 1u;
2944                     log << tcu::TestLog::Message << logFailureCount << ": ballot mismatch from " << OPtypeToStr(reason)
2945                         << tcu::TestLog::EndMessage;
2946                 }
2947             }
2948         }
2949     }
2950 
makePrerequisites(add_cref<std::vector<uint32_t>> outputP,const uint32_t subgroupSize,const uint32_t fragmentStride,const uint32_t primitiveStride,add_ref<std::vector<SubgroupState2>> stateStack,add_ref<std::vector<uint32_t>> outLoc,add_ref<uint32_t> subgroupCount)2951     virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
2952                                                              const uint32_t subgroupSize, const uint32_t fragmentStride,
2953                                                              const uint32_t primitiveStride,
2954                                                              add_ref<std::vector<SubgroupState2>> stateStack,
2955                                                              add_ref<std::vector<uint32_t>> outLoc,
2956                                                              add_ref<uint32_t> subgroupCount) override
2957     {
2958         auto prerequisites = std::make_shared<Arrangement>(outputP, fragmentStride, 1u, subgroupSize, primitiveStride);
2959         subgroupCount      = prerequisites->m_subgroupCount;
2960         stateStack.resize(10u, SubgroupState2(subgroupCount));
2961         outLoc.resize((subgroupCount * 128u * fragmentStride), 0u);
2962         stateStack.at(0).activeMask = prerequisites->m_initialBallots;
2963         return prerequisites;
2964     }
2965 };
2966 
2967 class VertexRandomProgram : public RandomProgram
2968 {
2969 public:
2970     static const constexpr uint32_t fillPercentage = 73u;
VertexRandomProgram(add_cref<CaseDef> c)2971     VertexRandomProgram(add_cref<CaseDef> c)
2972         : RandomProgram(c,
2973                         static_cast<uint32_t>(Arrangement::generatePrimitives(c.sizeX, c.sizeY, fillPercentage).size()))
2974     {
2975         DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_VERTEX_BIT);
2976     }
2977     virtual ~VertexRandomProgram() = default;
2978 
2979     struct Arrangement : Prerequisites
2980     {
2981         static constexpr uint32_t NUM_SUBGROUPS_OFFSET      = 0u;
2982         static constexpr uint32_t SUBGROUP_SIZE_OFFSET      = 1u;
2983         static constexpr uint32_t INVOCATION_COUNT_OFFSET   = 2u;
2984         static constexpr uint32_t INVOCATION_ENTRIES_OFFSET = 3u;
2985 
2986         const uint32_t m_subgroupSize;
2987         const uint32_t m_primitiveStride;
2988         const uint32_t m_subgroupCount;
2989         const Ballots m_initialBallots;
2990         const uint32_t m_invocationStride;
2991         const std::vector<uint32_t> m_primitiveSubgroups;
Arrangementvkt::Reconvergence::__anon4f2394780111::VertexRandomProgram::Arrangement2992         Arrangement(add_cref<std::vector<uint32_t>> outputP, uint32_t subgroupSize, uint32_t primitiveStride)
2993             : m_subgroupSize(subgroupSize)
2994             , m_primitiveStride(primitiveStride)
2995             , m_subgroupCount(calcSubgroupCount(outputP))
2996             , m_initialBallots(makeInitialBallots(subgroupSize, primitiveStride, outputP))
2997             , m_invocationStride(primitiveStride)
2998             , m_primitiveSubgroups(makePrimitiveSubgroups(subgroupSize, primitiveStride, outputP))
2999         {
3000         }
calcSubgroupCountvkt::Reconvergence::__anon4f2394780111::VertexRandomProgram::Arrangement3001         static uint32_t calcSubgroupCount(add_cref<std::vector<uint32_t>> outputP)
3002         {
3003             return outputP.at(NUM_SUBGROUPS_OFFSET);
3004         }
calcSubgroupSizevkt::Reconvergence::__anon4f2394780111::VertexRandomProgram::Arrangement3005         static uint32_t calcSubgroupSize(add_cref<std::vector<uint32_t>> outputP)
3006         {
3007             return outputP.at(SUBGROUP_SIZE_OFFSET);
3008         }
calcSubgroupInvocationStridevkt::Reconvergence::__anon4f2394780111::VertexRandomProgram::Arrangement3009         static uint32_t calcSubgroupInvocationStride(add_cref<std::vector<uint32_t>> outputP)
3010         {
3011             return outputP.at(INVOCATION_COUNT_OFFSET);
3012         }
makeInitialBallotsvkt::Reconvergence::__anon4f2394780111::VertexRandomProgram::Arrangement3013         static Ballots makeInitialBallots(uint32_t subgroupSize, uint32_t primitiveStride,
3014                                           add_cref<std::vector<uint32_t>> outputP)
3015         {
3016             DE_UNREF(subgroupSize);
3017             const uint32_t subgroupCount = calcSubgroupCount(outputP);
3018             Ballots initialBallots(subgroupCount);
3019             for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
3020             {
3021                 const uint32_t id = outputP.at(primitiveID + INVOCATION_ENTRIES_OFFSET);
3022                 if (id)
3023                 {
3024                     const uint32_t subgroupID           = (id >> 16) - 1u;
3025                     const uint32_t subgroupInvocationID = id & 0xFFFF;
3026                     DE_ASSERT(subgroupID < subgroupCount);
3027                     DE_ASSERT(subgroupInvocationID < subgroupSize);
3028                     initialBallots.at(subgroupID).set(subgroupInvocationID);
3029                 }
3030             }
3031             return initialBallots;
3032         }
makePrimitiveSubgroupsvkt::Reconvergence::__anon4f2394780111::VertexRandomProgram::Arrangement3033         static std::vector<uint32_t> makePrimitiveSubgroups(uint32_t subgroupSize, uint32_t primitiveStride,
3034                                                             add_cref<std::vector<uint32_t>> outputP)
3035         {
3036             std::vector<uint32_t> map(primitiveStride);
3037             for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
3038             {
3039                 const uint32_t id = outputP.at(primitiveID + INVOCATION_ENTRIES_OFFSET);
3040                 if (id)
3041                 {
3042                     const uint32_t subgroupID           = (id >> 16) - 1u;
3043                     const uint32_t subgroupInvocationID = id & 0xFFFF;
3044                     DE_ASSERT(subgroupInvocationID < subgroupSize);
3045                     map.at(primitiveID) = subgroupID * subgroupSize + subgroupInvocationID;
3046                 }
3047             }
3048             return map;
3049         }
generatePrimitivesvkt::Reconvergence::__anon4f2394780111::VertexRandomProgram::Arrangement3050         static std::vector<tcu::Vec4> generatePrimitives(uint32_t width, uint32_t height, uint32_t fillPercent)
3051         {
3052             deRandom rnd;
3053             std::map<uint32_t, int> map;
3054             std::vector<tcu::Vec4> points;
3055             const uint32_t frags = (width * height);
3056             const uint32_t total = (frags * fillPercent) / 100u;
3057 
3058             deRandom_init(&rnd, (width * height));
3059 
3060             for (uint32_t i = 0u; i < total; ++i)
3061             {
3062                 const uint32_t r = deRandom_getUint32(&rnd) % frags;
3063                 if (map[r] != 0)
3064                 {
3065                     i -= 1;
3066                     continue;
3067                 }
3068                 map[r] = 1;
3069 
3070                 uint32_t y = r / width;
3071                 uint32_t x = r % width;
3072                 float xx   = (float(x) + float(x + 1)) / (2.0f * float(width));
3073                 float yy   = (float(y) + float(y + 1)) / (2.0f * float(height));
3074                 float xxx  = xx * 2.0f - 1.0f;
3075                 float yyy  = yy * 2.0f - 1.0f;
3076                 points.emplace_back(tcu::Vec4(xxx, yyy, 0u, 0u));
3077             }
3078             return points;
3079         }
generateOutputPvectorvkt::Reconvergence::__anon4f2394780111::VertexRandomProgram::Arrangement3080         static std::vector<uint32_t> generateOutputPvector(uint32_t subgroupSize, uint32_t vertexCount)
3081         {
3082             const uint32_t subgroupCount = ROUNDUP(vertexCount, subgroupSize) / subgroupSize;
3083             std::vector<uint32_t> outputP(vertexCount + INVOCATION_ENTRIES_OFFSET);
3084             outputP.at(NUM_SUBGROUPS_OFFSET)    = subgroupCount;
3085             outputP.at(SUBGROUP_SIZE_OFFSET)    = subgroupSize;
3086             outputP.at(INVOCATION_COUNT_OFFSET) = vertexCount;
3087             for (uint32_t vertexID = 0u; vertexID < vertexCount; ++vertexID)
3088             {
3089                 const uint32_t subgroupID                        = vertexID / subgroupSize;
3090                 const uint32_t subgroupInvocationID              = vertexID % subgroupSize;
3091                 outputP.at(vertexID + INVOCATION_ENTRIES_OFFSET) = ((subgroupID + 1u) << 16) | subgroupInvocationID;
3092             }
3093             return outputP;
3094         }
3095     };
3096 
simulate(bool countOnly,uint32_t subgroupSize,add_ref<std::vector<uint64_t>> ref)3097     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
3098     {
3099         DE_ASSERT(false); // use overloaded version of simulate() instead
3100         DE_UNREF(countOnly);
3101         DE_UNREF(subgroupSize);
3102         DE_UNREF(ref);
3103         return 0;
3104     }
3105 
3106 protected:
genIf(IFType ifType,uint32_t)3107     virtual void genIf(IFType ifType, uint32_t /*maxLocalIndexCmp*/) override
3108     {
3109         RandomProgram::genIf(ifType, RandomProgram::invocationStride);
3110     }
3111 
getPartitionBallotText()3112     virtual std::string getPartitionBallotText() override
3113     {
3114         return "storeValue(outLoc++, subgroupBallot(true))";
3115     }
3116 
printIfLocalInvocationIndex(add_ref<std::stringstream> css,add_cref<FlowState> flow)3117     virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
3118     {
3119         printIndent(css);
3120         css << "if (invocationIndex() >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
3121     }
3122 
printStore(add_ref<std::stringstream> css,add_cref<FlowState> flow)3123     virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
3124     {
3125         printIndent(css);
3126         css << "storeValue(outLoc++, 0x" << std::hex << flow.ops[flow.opsIndex].value << std::dec << ");\n";
3127     }
3128 
printBallot(add_ref<std::stringstream> css,add_cref<FlowState>,bool endWithSemicolon=false)3129     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
3130                              bool endWithSemicolon = false) override
3131     {
3132         printIndent(css);
3133         // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
3134         // a ballot, to make sure the ballot is "diverged enough". Don't do this for
3135         // subgroup_uniform_control_flow, since we only validate results that must be fully
3136         // reconverged.
3137         if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
3138         {
3139             css << getPartitionBallotText();
3140         }
3141         else
3142         {
3143             css << "storeValue(outLoc++, subgroupBallot(true))";
3144         }
3145         if (endWithSemicolon)
3146         {
3147             css << ";\n";
3148         }
3149     }
3150 
simulateBallot(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t unusedPrimitiveID,const int32_t opsIndex,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)3151     virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
3152                                 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
3153                                 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
3154                                 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
3155                                 const OPType reason, const tcu::UVec4 *cmp) override
3156     {
3157         DE_UNREF(unusedPrimitiveID);
3158         DE_UNREF(opsIndex);
3159         add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
3160         for (uint32_t primitiveID = 0u; primitiveID < a.m_primitiveStride; ++primitiveID)
3161         {
3162             const uint32_t sgid = a.m_primitiveSubgroups.at(primitiveID);
3163             DE_ASSERT(sgid < (a.m_subgroupCount * a.m_subgroupSize));
3164             if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
3165                 continue;
3166             const uint32_t index = (outLoc.at(primitiveID)++) * a.m_invocationStride + primitiveID;
3167             if (false == countOnly)
3168             {
3169                 ref.at(index) = Ballot(activeMask.at(sgid / a.m_subgroupSize));
3170                 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
3171                 {
3172                     logFailureCount -= 1u;
3173                     log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
3174                         << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
3175                 }
3176             }
3177         }
3178     }
3179 
simulateStore(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t unusedPrimitiveID,const uint64_t storeValue,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)3180     virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
3181                                const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
3182                                add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
3183                                std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
3184                                const OPType reason, const tcu::UVec4 *cmp) override
3185     {
3186         DE_UNREF(unusedPrimitiveID);
3187         add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
3188         for (uint32_t primitiveID = 0u; primitiveID < a.m_primitiveStride; ++primitiveID)
3189         {
3190             const uint32_t sgid = a.m_primitiveSubgroups.at(primitiveID);
3191             DE_ASSERT(sgid < (a.m_subgroupCount * a.m_subgroupSize));
3192             if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
3193                 continue;
3194             const uint32_t index = (outLoc.at(primitiveID)++) * a.m_invocationStride + primitiveID;
3195             if (false == countOnly)
3196             {
3197                 ref.at(index) = Ballot(tcu::UVec4(uint32_t(storeValue & 0xFFFFFFFF), 0u, 0u, 0u));
3198                 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
3199                 {
3200                     logFailureCount -= 1u;
3201                     log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
3202                         << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
3203                 }
3204             }
3205         }
3206     }
3207 
makePrerequisites(add_cref<std::vector<uint32_t>> outputP,const uint32_t subgroupSize,const uint32_t fragmentStride,const uint32_t primitiveStride,add_ref<std::vector<SubgroupState2>> stateStack,add_ref<std::vector<uint32_t>> outLoc,add_ref<uint32_t> subgroupCount)3208     virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
3209                                                              const uint32_t subgroupSize, const uint32_t fragmentStride,
3210                                                              const uint32_t primitiveStride,
3211                                                              add_ref<std::vector<SubgroupState2>> stateStack,
3212                                                              add_ref<std::vector<uint32_t>> outLoc,
3213                                                              add_ref<uint32_t> subgroupCount) override
3214     {
3215         DE_UNREF(fragmentStride);
3216         auto prerequisites = std::make_shared<Arrangement>(outputP, subgroupSize, primitiveStride);
3217         subgroupCount      = prerequisites->m_subgroupCount;
3218         stateStack.resize(10u, SubgroupState2(subgroupCount));
3219         outLoc.resize(primitiveStride, 0u);
3220         stateStack.at(0).activeMask = prerequisites->m_initialBallots;
3221         return prerequisites;
3222     }
3223 };
3224 
3225 class TessCtrlRandomProgram : public RandomProgram
3226 {
3227 public:
TessCtrlRandomProgram(add_cref<CaseDef> c,uint32_t invocationCount)3228     TessCtrlRandomProgram(add_cref<CaseDef> c, uint32_t invocationCount) : RandomProgram(c, invocationCount)
3229     {
3230         DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT);
3231     }
3232     virtual ~TessCtrlRandomProgram() = default;
3233 
3234     static const uint32_t minSubgroupSize = 4;
3235 
genIf(IFType ifType,uint32_t)3236     virtual void genIf(IFType ifType, uint32_t /*maxLocalIndexCmp*/) override
3237     {
3238         RandomProgram::genIf(ifType, std::min((minSubgroupSize * caseDef.sizeX), 64u));
3239     }
3240 
printIfLocalInvocationIndex(add_ref<std::stringstream> css,add_cref<FlowState> flow)3241     virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
3242     {
3243         printIndent(css);
3244         css << "if (";
3245         css << "((((gl_PrimitiveID * width) / gl_SubgroupSize) * gl_SubgroupSize) + gl_SubgroupInvocationID)";
3246         css << " >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
3247     }
3248 
printStore(add_ref<std::stringstream> css,add_cref<FlowState> flow)3249     virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
3250     {
3251         printIndent(css);
3252         css << "outputC.loc[invocationIndex()]++;\n";
3253         printIndent(css);
3254         css << "outputB.b[(outLoc++) * invocationStride + invocationIndex()].x = 0x" << std::hex
3255             << flow.ops[flow.opsIndex].value << ";\n";
3256     }
3257 
printBallot(add_ref<std::stringstream> css,add_cref<FlowState>,bool endWithSemicolon=false)3258     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
3259                              bool endWithSemicolon = false) override
3260     {
3261         printIndent(css);
3262 
3263         css << "outputC.loc[invocationIndex()]++,";
3264         // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
3265         // a ballot, to make sure the ballot is "diverged enough". Don't do this for
3266         // subgroup_uniform_control_flow, since we only validate results that must be fully
3267         // reconverged.
3268         if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
3269         {
3270             css << "outputB.b[(outLoc++) * invocationStride + invocationIndex()] = " << getPartitionBallotText()
3271                 << ".xy";
3272         }
3273         else
3274         {
3275             css << "outputB.b[(outLoc++) * invocationStride + invocationIndex()] = subgroupBallot(true).xy";
3276         }
3277         if (endWithSemicolon)
3278         {
3279             css << ";\n";
3280         }
3281     }
3282 
simulateStoreToChange(bool countOnly,uint32_t,const SubgroupState (& stateStack)[10],int32_t opsIndex,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<uint64_t>> ref)3283     void simulateStoreToChange(bool countOnly, uint32_t /*subgroupSize*/, const SubgroupState (&stateStack)[10],
3284                                int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
3285                                add_ref<std::vector<uint64_t>> ref)
3286     {
3287         for (uint32_t id = 0; id < invocationStride; ++id)
3288         {
3289             if (stateStack[nesting].activeMask.test(id))
3290             {
3291                 if (countOnly)
3292                     outLoc[id]++;
3293                 else
3294                     ref[(outLoc[id]++) * invocationStride + id] = ops[opsIndex].value;
3295             }
3296         }
3297     }
3298 
simulateBallotToChange(bool countOnly,uint32_t subgroupSize,const SubgroupState (& stateStack)[10],uint32_t,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<uint64_t>> ref)3299     void simulateBallotToChange(bool countOnly, uint32_t subgroupSize, const SubgroupState (&stateStack)[10],
3300                                 uint32_t /*opsIndex*/, add_ref<std::vector<uint32_t>> outLoc,
3301                                 add_ref<std::vector<uint64_t>> ref)
3302     {
3303         for (uint32_t id = 0; id < invocationStride; ++id)
3304         {
3305             if (stateStack[nesting].activeMask.test(id))
3306             {
3307                 if (countOnly)
3308                     outLoc[id]++;
3309                 else
3310                     ref[(outLoc[id]++) * invocationStride + id] =
3311                         bitsetToU64(stateStack[nesting].activeMask, subgroupSize, id);
3312             }
3313         }
3314     }
3315 
3316     // Simulate execution of the program. If countOnly is true, just return
3317     // the max number of outputs written. If it's false, store out the result
3318     // values to ref.
simulate(bool countOnly,uint32_t subgroupSize,add_ref<std::vector<uint64_t>> ref)3319     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
3320     {
3321         SubgroupState stateStack[10];
3322         deMemset(&stateStack, 0, sizeof(stateStack));
3323 
3324         // Per-invocation output location counters
3325         std::vector<uint32_t> outLoc(invocationStride, 0u);
3326 
3327         nesting     = 0;
3328         loopNesting = 0;
3329 
3330         for (uint32_t k = 0; k < invocationStride; ++k)
3331             stateStack[nesting].activeMask.set(k);
3332 
3333         int32_t i = 0;
3334         while (i < (int32_t)ops.size())
3335         {
3336             switch (ops[i].type)
3337             {
3338             case OP_BALLOT:
3339                 simulateBallotToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3340                 break;
3341             case OP_STORE:
3342                 simulateStoreToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3343                 break;
3344             case OP_IF_MASK:
3345                 nesting++;
3346                 stateStack[nesting].activeMask =
3347                     stateStack[nesting - 1].activeMask & bitsetFromU64(ops[i].value, subgroupSize);
3348                 stateStack[nesting].header   = i;
3349                 stateStack[nesting].isLoop   = 0;
3350                 stateStack[nesting].isSwitch = 0;
3351                 break;
3352             case OP_ELSE_MASK:
3353                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask &
3354                                                  ~bitsetFromU64(ops[stateStack[nesting].header].value, subgroupSize);
3355                 break;
3356             case OP_IF_LOOPCOUNT:
3357             {
3358                 uint32_t n = nesting;
3359                 while (!stateStack[n].isLoop)
3360                     n--;
3361 
3362                 nesting++;
3363                 stateStack[nesting].activeMask =
3364                     stateStack[nesting - 1].activeMask & bitsetFromU64((1ULL << stateStack[n].tripCount), subgroupSize);
3365                 stateStack[nesting].header   = i;
3366                 stateStack[nesting].isLoop   = 0;
3367                 stateStack[nesting].isSwitch = 0;
3368                 break;
3369             }
3370             case OP_ELSE_LOOPCOUNT:
3371             {
3372                 uint32_t n = nesting;
3373                 while (!stateStack[n].isLoop)
3374                     n--;
3375 
3376                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask &
3377                                                  ~bitsetFromU64((1ULL << stateStack[n].tripCount), subgroupSize);
3378                 break;
3379             }
3380             case OP_IF_LOCAL_INVOCATION_INDEX: // TessCtrlRandomProgram
3381             {
3382                 // all bits >= N
3383                 bitset_inv_t mask;
3384                 for (uint32_t j = static_cast<uint32_t>(ops[i].value); j < invocationStride; ++j)
3385                     mask.set(j);
3386 
3387                 nesting++;
3388                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
3389                 stateStack[nesting].header     = i;
3390                 stateStack[nesting].isLoop     = 0;
3391                 stateStack[nesting].isSwitch   = 0;
3392                 break;
3393             }
3394             case OP_ELSE_LOCAL_INVOCATION_INDEX: // TessCtrlRandomProgram
3395             {
3396                 // all bits < N
3397                 bitset_inv_t mask;
3398                 for (uint32_t j = 0; j < static_cast<uint32_t>(ops[i].value); ++j)
3399                     mask.set(j);
3400 
3401                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
3402                 break;
3403             }
3404             case OP_ENDIF:
3405                 nesting--;
3406                 break;
3407             case OP_BEGIN_FOR_UNIF:
3408                 // XXX TODO: We don't handle a for loop with zero iterations
3409                 nesting++;
3410                 loopNesting++;
3411                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
3412                 stateStack[nesting].header       = i;
3413                 stateStack[nesting].tripCount    = 0;
3414                 stateStack[nesting].isLoop       = 1;
3415                 stateStack[nesting].isSwitch     = 0;
3416                 stateStack[nesting].continueMask = 0;
3417                 break;
3418             case OP_END_FOR_UNIF:
3419                 stateStack[nesting].tripCount++;
3420                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3421                 stateStack[nesting].continueMask = 0;
3422                 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
3423                     stateStack[nesting].activeMask.any())
3424                 {
3425                     i = stateStack[nesting].header + 1;
3426                     continue;
3427                 }
3428                 else
3429                 {
3430                     loopNesting--;
3431                     nesting--;
3432                 }
3433                 break;
3434             case OP_BEGIN_DO_WHILE_UNIF:
3435                 // XXX TODO: We don't handle a for loop with zero iterations
3436                 nesting++;
3437                 loopNesting++;
3438                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
3439                 stateStack[nesting].header       = i;
3440                 stateStack[nesting].tripCount    = 1;
3441                 stateStack[nesting].isLoop       = 1;
3442                 stateStack[nesting].isSwitch     = 0;
3443                 stateStack[nesting].continueMask = 0;
3444                 break;
3445             case OP_END_DO_WHILE_UNIF:
3446                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3447                 stateStack[nesting].continueMask = 0;
3448                 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
3449                     stateStack[nesting].activeMask.any())
3450                 {
3451                     i = stateStack[nesting].header + 1;
3452                     stateStack[nesting].tripCount++;
3453                     continue;
3454                 }
3455                 else
3456                 {
3457                     loopNesting--;
3458                     nesting--;
3459                 }
3460                 break;
3461             case OP_BEGIN_FOR_VAR:
3462                 // XXX TODO: We don't handle a for loop with zero iterations
3463                 nesting++;
3464                 loopNesting++;
3465                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
3466                 stateStack[nesting].header       = i;
3467                 stateStack[nesting].tripCount    = 0;
3468                 stateStack[nesting].isLoop       = 1;
3469                 stateStack[nesting].isSwitch     = 0;
3470                 stateStack[nesting].continueMask = 0;
3471                 break;
3472             case OP_END_FOR_VAR:
3473                 stateStack[nesting].tripCount++;
3474                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3475                 stateStack[nesting].continueMask = 0;
3476                 stateStack[nesting].activeMask &= bitsetFromU64(stateStack[nesting].tripCount == subgroupSize ?
3477                                                                     0 :
3478                                                                     ~((1ULL << (stateStack[nesting].tripCount)) - 1),
3479                                                                 subgroupSize);
3480                 if (stateStack[nesting].activeMask.any())
3481                 {
3482                     i = stateStack[nesting].header + 1;
3483                     continue;
3484                 }
3485                 else
3486                 {
3487                     loopNesting--;
3488                     nesting--;
3489                 }
3490                 break;
3491             case OP_BEGIN_FOR_INF:
3492             case OP_BEGIN_DO_WHILE_INF:
3493                 nesting++;
3494                 loopNesting++;
3495                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
3496                 stateStack[nesting].header       = i;
3497                 stateStack[nesting].tripCount    = 0;
3498                 stateStack[nesting].isLoop       = 1;
3499                 stateStack[nesting].isSwitch     = 0;
3500                 stateStack[nesting].continueMask = 0;
3501                 break;
3502             case OP_END_FOR_INF:
3503                 stateStack[nesting].tripCount++;
3504                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3505                 stateStack[nesting].continueMask = 0;
3506                 if (stateStack[nesting].activeMask.any())
3507                 {
3508                     // output expected OP_BALLOT values
3509                     simulateBallotToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3510 
3511                     i = stateStack[nesting].header + 1;
3512                     continue;
3513                 }
3514                 else
3515                 {
3516                     loopNesting--;
3517                     nesting--;
3518                 }
3519                 break;
3520             case OP_END_DO_WHILE_INF:
3521                 stateStack[nesting].tripCount++;
3522                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3523                 stateStack[nesting].continueMask = 0;
3524                 if (stateStack[nesting].activeMask.any())
3525                 {
3526                     i = stateStack[nesting].header + 1;
3527                     continue;
3528                 }
3529                 else
3530                 {
3531                     loopNesting--;
3532                     nesting--;
3533                 }
3534                 break;
3535             case OP_BREAK:
3536             {
3537                 uint32_t n        = nesting;
3538                 bitset_inv_t mask = stateStack[nesting].activeMask;
3539                 while (true)
3540                 {
3541                     stateStack[n].activeMask &= ~mask;
3542                     if (stateStack[n].isLoop || stateStack[n].isSwitch)
3543                         break;
3544 
3545                     n--;
3546                 }
3547             }
3548             break;
3549             case OP_CONTINUE:
3550             {
3551                 uint32_t n        = nesting;
3552                 bitset_inv_t mask = stateStack[nesting].activeMask;
3553                 while (true)
3554                 {
3555                     stateStack[n].activeMask &= ~mask;
3556                     if (stateStack[n].isLoop)
3557                     {
3558                         stateStack[n].continueMask |= mask;
3559                         break;
3560                     }
3561                     n--;
3562                 }
3563             }
3564             break;
3565             case OP_ELECT:
3566             {
3567                 nesting++;
3568                 stateStack[nesting].activeMask = bitsetElect(stateStack[nesting - 1].activeMask, subgroupSize);
3569                 stateStack[nesting].header     = i;
3570                 stateStack[nesting].isLoop     = 0;
3571                 stateStack[nesting].isSwitch   = 0;
3572             }
3573             break;
3574             case OP_RETURN:
3575             {
3576                 bitset_inv_t mask = stateStack[nesting].activeMask;
3577                 for (int32_t n = nesting; n >= 0; --n)
3578                 {
3579                     stateStack[n].activeMask &= ~mask;
3580                     if (stateStack[n].isCall)
3581                         break;
3582                 }
3583             }
3584             break;
3585 
3586             case OP_CALL_BEGIN:
3587                 nesting++;
3588                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3589                 stateStack[nesting].isLoop     = 0;
3590                 stateStack[nesting].isSwitch   = 0;
3591                 stateStack[nesting].isCall     = 1;
3592                 break;
3593             case OP_CALL_END:
3594                 stateStack[nesting].isCall = 0;
3595                 nesting--;
3596                 break;
3597             case OP_NOISE:
3598                 break;
3599 
3600             case OP_SWITCH_UNIF_BEGIN:
3601             case OP_SWITCH_VAR_BEGIN:
3602             case OP_SWITCH_LOOP_COUNT_BEGIN:
3603                 nesting++;
3604                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3605                 stateStack[nesting].header     = i;
3606                 stateStack[nesting].isLoop     = 0;
3607                 stateStack[nesting].isSwitch   = 1;
3608                 break;
3609             case OP_SWITCH_END:
3610                 nesting--;
3611                 break;
3612             case OP_CASE_MASK_BEGIN:
3613                 stateStack[nesting].activeMask =
3614                     stateStack[nesting - 1].activeMask & bitsetFromU64(ops[i].value, subgroupSize);
3615                 break;
3616             case OP_CASE_LOOP_COUNT_BEGIN:
3617             {
3618                 uint32_t n = nesting;
3619                 uint32_t l = loopNesting;
3620 
3621                 while (true)
3622                 {
3623                     if (stateStack[n].isLoop)
3624                     {
3625                         l--;
3626                         if (l == ops[stateStack[nesting].header].value)
3627                             break;
3628                     }
3629                     n--;
3630                 }
3631 
3632                 if ((1ULL << stateStack[n].tripCount) & ops[i].value)
3633                     stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3634                 else
3635                     stateStack[nesting].activeMask = 0;
3636                 break;
3637             }
3638             case OP_CASE_END:
3639                 break;
3640 
3641             default:
3642                 DE_ASSERT(0);
3643                 break;
3644             }
3645             i++;
3646         }
3647         uint32_t maxLoc = 0;
3648         for (uint32_t id = 0; id < (uint32_t)outLoc.size(); ++id)
3649             maxLoc = de::max(maxLoc, outLoc[id]);
3650 
3651         return maxLoc;
3652     }
3653 };
3654 
3655 class TessEvalRandomProgram : public RandomProgram
3656 {
3657 public:
TessEvalRandomProgram(add_cref<CaseDef> c,uint32_t invocationCount=0)3658     TessEvalRandomProgram(add_cref<CaseDef> c, uint32_t invocationCount = 0)
3659         : RandomProgram(c, (invocationCount ? invocationCount : 64))
3660         , ifLocalInvocationIndexAsSubgroupInvocationID(false)
3661     {
3662         DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT);
3663     }
3664     virtual ~TessEvalRandomProgram() = default;
3665 
3666     const bool ifLocalInvocationIndexAsSubgroupInvocationID;
3667     static const uint32_t quadInvocationCount = 4;
3668 
3669     // Simulate execution of the program. If countOnly is true, just return
3670     // the max number of outputs written. If it's false, store out the result
3671     // values to ref.
simulate(bool countOnly,uint32_t subgroupSize,add_ref<std::vector<uint64_t>> ref)3672     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
3673     {
3674         SubgroupState stateStack[10];
3675         deMemset(&stateStack, 0, sizeof(stateStack));
3676 
3677         // Per-invocation output location counters
3678         std::vector<uint32_t> outLoc(invocationStride, 0u);
3679 
3680         nesting     = 0;
3681         loopNesting = 0;
3682 
3683         for (uint32_t k = 0; k < invocationStride; ++k)
3684             stateStack[nesting].activeMask.set(k);
3685 
3686         int32_t i = 0;
3687         while (i < (int32_t)ops.size())
3688         {
3689             switch (ops[i].type)
3690             {
3691             case OP_BALLOT:
3692                 simulateBallotToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3693                 break;
3694             case OP_STORE:
3695                 simulateStoreToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3696                 break;
3697             case OP_IF_MASK:
3698                 nesting++;
3699                 stateStack[nesting].activeMask =
3700                     stateStack[nesting - 1].activeMask & bitsetFromU64(ops[i].value, subgroupSize);
3701                 stateStack[nesting].header   = i;
3702                 stateStack[nesting].isLoop   = 0;
3703                 stateStack[nesting].isSwitch = 0;
3704                 break;
3705             case OP_ELSE_MASK:
3706                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask &
3707                                                  ~bitsetFromU64(ops[stateStack[nesting].header].value, subgroupSize);
3708                 break;
3709             case OP_IF_LOOPCOUNT:
3710             {
3711                 uint32_t n = nesting;
3712                 while (!stateStack[n].isLoop)
3713                     n--;
3714 
3715                 nesting++;
3716                 stateStack[nesting].activeMask =
3717                     stateStack[nesting - 1].activeMask & bitsetFromU64((1ULL << stateStack[n].tripCount), subgroupSize);
3718                 stateStack[nesting].header   = i;
3719                 stateStack[nesting].isLoop   = 0;
3720                 stateStack[nesting].isSwitch = 0;
3721                 break;
3722             }
3723             case OP_ELSE_LOOPCOUNT:
3724             {
3725                 uint32_t n = nesting;
3726                 while (!stateStack[n].isLoop)
3727                     n--;
3728 
3729                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask &
3730                                                  ~bitsetFromU64((1ULL << stateStack[n].tripCount), subgroupSize);
3731                 break;
3732             }
3733             case OP_IF_LOCAL_INVOCATION_INDEX: // TessEvalRandomProgram
3734             {
3735                 bitset_inv_t mask;
3736                 if (ifLocalInvocationIndexAsSubgroupInvocationID)
3737                 {
3738                     // if (gl_SubgroupInvocationID >= value), all bits >= N
3739                     for (uint32_t j = static_cast<uint32_t>(ops[i].value); j < subgroupSize; ++j)
3740                         mask.set(j);
3741                     mask = bitsetFromU64(mask.to_ullong(), subgroupSize);
3742                 }
3743                 else
3744                 {
3745                     // all bits >= N
3746                     for (uint32_t j = (uint32_t)ops[i].value; j < invocationStride; ++j)
3747                         mask.set(j);
3748                 }
3749 
3750                 nesting++;
3751                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
3752                 stateStack[nesting].header     = i;
3753                 stateStack[nesting].isLoop     = 0;
3754                 stateStack[nesting].isSwitch   = 0;
3755                 break;
3756             }
3757             case OP_ELSE_LOCAL_INVOCATION_INDEX: // TessEvalRandomProgram
3758             {
3759                 // all bits < N
3760                 bitset_inv_t mask;
3761                 for (uint32_t j = 0; j < static_cast<uint32_t>(ops[i].value); ++j)
3762                     mask.set(j);
3763 
3764                 if (ifLocalInvocationIndexAsSubgroupInvocationID)
3765                 {
3766                     // else (gl_SubgroupInvocationID >= value), all bits < N
3767                     mask = bitsetFromU64(mask.to_ullong(), subgroupSize);
3768                 }
3769 
3770                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
3771                 break;
3772             }
3773             case OP_ENDIF:
3774                 nesting--;
3775                 break;
3776             case OP_BEGIN_FOR_UNIF:
3777                 // XXX TODO: We don't handle a for loop with zero iterations
3778                 nesting++;
3779                 loopNesting++;
3780                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
3781                 stateStack[nesting].header       = i;
3782                 stateStack[nesting].tripCount    = 0;
3783                 stateStack[nesting].isLoop       = 1;
3784                 stateStack[nesting].isSwitch     = 0;
3785                 stateStack[nesting].continueMask = 0;
3786                 break;
3787             case OP_END_FOR_UNIF:
3788                 stateStack[nesting].tripCount++;
3789                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3790                 stateStack[nesting].continueMask = 0;
3791                 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
3792                     stateStack[nesting].activeMask.any())
3793                 {
3794                     i = stateStack[nesting].header + 1;
3795                     continue;
3796                 }
3797                 else
3798                 {
3799                     loopNesting--;
3800                     nesting--;
3801                 }
3802                 break;
3803             case OP_BEGIN_DO_WHILE_UNIF:
3804                 // XXX TODO: We don't handle a for loop with zero iterations
3805                 nesting++;
3806                 loopNesting++;
3807                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
3808                 stateStack[nesting].header       = i;
3809                 stateStack[nesting].tripCount    = 1;
3810                 stateStack[nesting].isLoop       = 1;
3811                 stateStack[nesting].isSwitch     = 0;
3812                 stateStack[nesting].continueMask = 0;
3813                 break;
3814             case OP_END_DO_WHILE_UNIF:
3815                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3816                 stateStack[nesting].continueMask = 0;
3817                 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
3818                     stateStack[nesting].activeMask.any())
3819                 {
3820                     i = stateStack[nesting].header + 1;
3821                     stateStack[nesting].tripCount++;
3822                     continue;
3823                 }
3824                 else
3825                 {
3826                     loopNesting--;
3827                     nesting--;
3828                 }
3829                 break;
3830             case OP_BEGIN_FOR_VAR:
3831                 // XXX TODO: We don't handle a for loop with zero iterations
3832                 nesting++;
3833                 loopNesting++;
3834                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
3835                 stateStack[nesting].header       = i;
3836                 stateStack[nesting].tripCount    = 0;
3837                 stateStack[nesting].isLoop       = 1;
3838                 stateStack[nesting].isSwitch     = 0;
3839                 stateStack[nesting].continueMask = 0;
3840                 break;
3841             case OP_END_FOR_VAR:
3842                 stateStack[nesting].tripCount++;
3843                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3844                 stateStack[nesting].continueMask = 0;
3845                 stateStack[nesting].activeMask &= bitsetFromU64(stateStack[nesting].tripCount == subgroupSize ?
3846                                                                     0 :
3847                                                                     ~((1ULL << (stateStack[nesting].tripCount)) - 1),
3848                                                                 subgroupSize);
3849                 if (stateStack[nesting].activeMask.any())
3850                 {
3851                     i = stateStack[nesting].header + 1;
3852                     continue;
3853                 }
3854                 else
3855                 {
3856                     loopNesting--;
3857                     nesting--;
3858                 }
3859                 break;
3860             case OP_BEGIN_FOR_INF:
3861             case OP_BEGIN_DO_WHILE_INF:
3862                 nesting++;
3863                 loopNesting++;
3864                 stateStack[nesting].activeMask   = stateStack[nesting - 1].activeMask;
3865                 stateStack[nesting].header       = i;
3866                 stateStack[nesting].tripCount    = 0;
3867                 stateStack[nesting].isLoop       = 1;
3868                 stateStack[nesting].isSwitch     = 0;
3869                 stateStack[nesting].continueMask = 0;
3870                 break;
3871             case OP_END_FOR_INF:
3872                 stateStack[nesting].tripCount++;
3873                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3874                 stateStack[nesting].continueMask = 0;
3875                 if (stateStack[nesting].activeMask.any())
3876                 {
3877                     // output expected OP_BALLOT values
3878                     simulateBallotToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3879 
3880                     i = stateStack[nesting].header + 1;
3881                     continue;
3882                 }
3883                 else
3884                 {
3885                     loopNesting--;
3886                     nesting--;
3887                 }
3888                 break;
3889             case OP_END_DO_WHILE_INF:
3890                 stateStack[nesting].tripCount++;
3891                 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3892                 stateStack[nesting].continueMask = 0;
3893                 if (stateStack[nesting].activeMask.any())
3894                 {
3895                     i = stateStack[nesting].header + 1;
3896                     continue;
3897                 }
3898                 else
3899                 {
3900                     loopNesting--;
3901                     nesting--;
3902                 }
3903                 break;
3904             case OP_BREAK:
3905             {
3906                 uint32_t n        = nesting;
3907                 bitset_inv_t mask = stateStack[nesting].activeMask;
3908                 while (true)
3909                 {
3910                     stateStack[n].activeMask &= ~mask;
3911                     if (stateStack[n].isLoop || stateStack[n].isSwitch)
3912                         break;
3913 
3914                     n--;
3915                 }
3916             }
3917             break;
3918             case OP_CONTINUE:
3919             {
3920                 uint32_t n        = nesting;
3921                 bitset_inv_t mask = stateStack[nesting].activeMask;
3922                 while (true)
3923                 {
3924                     stateStack[n].activeMask &= ~mask;
3925                     if (stateStack[n].isLoop)
3926                     {
3927                         stateStack[n].continueMask |= mask;
3928                         break;
3929                     }
3930                     n--;
3931                 }
3932             }
3933             break;
3934             case OP_ELECT:
3935             {
3936                 nesting++;
3937                 stateStack[nesting].activeMask = bitsetElect(stateStack[nesting - 1].activeMask, subgroupSize);
3938                 stateStack[nesting].header     = i;
3939                 stateStack[nesting].isLoop     = 0;
3940                 stateStack[nesting].isSwitch   = 0;
3941             }
3942             break;
3943             case OP_RETURN:
3944             {
3945                 bitset_inv_t mask = stateStack[nesting].activeMask;
3946                 for (int32_t n = nesting; n >= 0; --n)
3947                 {
3948                     stateStack[n].activeMask &= ~mask;
3949                     if (stateStack[n].isCall)
3950                         break;
3951                 }
3952             }
3953             break;
3954 
3955             case OP_CALL_BEGIN:
3956                 nesting++;
3957                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3958                 stateStack[nesting].isLoop     = 0;
3959                 stateStack[nesting].isSwitch   = 0;
3960                 stateStack[nesting].isCall     = 1;
3961                 break;
3962             case OP_CALL_END:
3963                 stateStack[nesting].isCall = 0;
3964                 nesting--;
3965                 break;
3966             case OP_NOISE:
3967                 break;
3968 
3969             case OP_SWITCH_UNIF_BEGIN:
3970             case OP_SWITCH_VAR_BEGIN:
3971             case OP_SWITCH_LOOP_COUNT_BEGIN:
3972                 nesting++;
3973                 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3974                 stateStack[nesting].header     = i;
3975                 stateStack[nesting].isLoop     = 0;
3976                 stateStack[nesting].isSwitch   = 1;
3977                 break;
3978             case OP_SWITCH_END:
3979                 nesting--;
3980                 break;
3981             case OP_CASE_MASK_BEGIN:
3982                 stateStack[nesting].activeMask =
3983                     stateStack[nesting - 1].activeMask & bitsetFromU64(ops[i].value, subgroupSize);
3984                 break;
3985             case OP_CASE_LOOP_COUNT_BEGIN:
3986             {
3987                 uint32_t n = nesting;
3988                 uint32_t l = loopNesting;
3989 
3990                 while (true)
3991                 {
3992                     if (stateStack[n].isLoop)
3993                     {
3994                         l--;
3995                         if (l == ops[stateStack[nesting].header].value)
3996                             break;
3997                     }
3998                     n--;
3999                 }
4000 
4001                 if ((1ULL << stateStack[n].tripCount) & ops[i].value)
4002                     stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
4003                 else
4004                     stateStack[nesting].activeMask = 0;
4005                 break;
4006             }
4007             case OP_CASE_END:
4008                 break;
4009 
4010             default:
4011                 DE_ASSERT(0);
4012                 break;
4013             }
4014             i++;
4015         }
4016         uint32_t maxLoc = 0;
4017         for (uint32_t id = 0; id < (uint32_t)outLoc.size(); ++id)
4018             maxLoc = de::max(maxLoc, outLoc[id]);
4019 
4020         return maxLoc;
4021     }
4022 
4023 protected:
genIf(IFType ifType,uint32_t)4024     virtual void genIf(IFType ifType, uint32_t /*maxLocalIndexCmp*/) override
4025     {
4026         RandomProgram::genIf(ifType, std::min(64u, (caseDef.sizeX * quadInvocationCount - 1)));
4027     }
4028 
printIfLocalInvocationIndex(add_ref<std::stringstream> css,add_cref<FlowState> flow)4029     virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
4030     {
4031         // uint invocationIndex() { return gl_PrimitiveID * width + gl_SubgroupInvocationID; }
4032         printIndent(css);
4033         css << "if (";
4034         if (ifLocalInvocationIndexAsSubgroupInvocationID)
4035             css << "gl_SubgroupInvocationID";
4036         else
4037             css << "((((gl_PrimitiveID * width) / gl_SubgroupSize) * gl_SubgroupSize) + gl_SubgroupInvocationID)";
4038         css << " >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
4039     }
4040 
printStore(add_ref<std::stringstream> css,add_cref<FlowState> flow)4041     virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
4042     {
4043         printIndent(css);
4044         css << "outputC.loc[invocationIndex()]++;\n";
4045         printIndent(css);
4046         css << "outputB.b[(outLoc++)*invocationStride + invocationIndex()].x = 0x" << std::hex
4047             << flow.ops[flow.opsIndex].value << ";\n";
4048     }
4049 
printBallot(add_ref<std::stringstream> css,add_cref<FlowState>,bool endWithSemicolon=false)4050     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
4051                              bool endWithSemicolon = false) override
4052     {
4053         printIndent(css);
4054 
4055         css << "outputC.loc[invocationIndex()]++,";
4056         // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
4057         // a ballot, to make sure the ballot is "diverged enough". Don't do this for
4058         // subgroup_uniform_control_flow, since we only validate results that must be fully
4059         // reconverged.
4060         if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
4061         {
4062             css << "outputB.b[(outLoc++)*invocationStride + invocationIndex()] = " << getPartitionBallotText() << ".xy";
4063         }
4064         else
4065         {
4066             css << "outputB.b[(outLoc++)*invocationStride + invocationIndex()] = subgroupBallot(true).xy";
4067         }
4068         if (endWithSemicolon)
4069         {
4070             css << ";\n";
4071         }
4072     }
4073 
simulateStoreToChange(bool countOnly,uint32_t,const SubgroupState (& stateStack)[10],int32_t opsIndex,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<uint64_t>> ref)4074     void simulateStoreToChange(bool countOnly, uint32_t /*subgroupSize*/, const SubgroupState (&stateStack)[10],
4075                                int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
4076                                add_ref<std::vector<uint64_t>> ref)
4077     {
4078         for (uint32_t id = 0; id < invocationStride; ++id)
4079         {
4080             if (stateStack[nesting].activeMask.test(id))
4081             {
4082                 if (countOnly)
4083                     outLoc[id]++;
4084                 else
4085                     ref[(outLoc[id]++) * invocationStride + id] = ops[opsIndex].value;
4086             }
4087         }
4088     }
4089 
simulateBallotToChange(bool countOnly,uint32_t subgroupSize,const SubgroupState (& stateStack)[10],uint32_t,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<uint64_t>> ref)4090     void simulateBallotToChange(bool countOnly, uint32_t subgroupSize, const SubgroupState (&stateStack)[10],
4091                                 uint32_t /*opsIndex*/, add_ref<std::vector<uint32_t>> outLoc,
4092                                 add_ref<std::vector<uint64_t>> ref)
4093     {
4094         for (uint32_t id = 0; id < invocationStride; ++id)
4095         {
4096             if (stateStack[nesting].activeMask.test(id))
4097             {
4098                 if (countOnly)
4099                     outLoc[id]++;
4100                 else
4101                     ref[(outLoc[id]++) * invocationStride + id] =
4102                         bitsetToU64(stateStack[nesting].activeMask, subgroupSize, id);
4103             }
4104         }
4105     }
4106 };
4107 
4108 class GeometryRandomProgram : public RandomProgram
4109 {
4110 public:
4111     static const constexpr uint32_t fillPercentage = 71u;
GeometryRandomProgram(add_cref<CaseDef> c)4112     GeometryRandomProgram(add_cref<CaseDef> c)
4113         : RandomProgram(c, Arrangement::calculatePrimitiveCount(c.sizeX, c.sizeY, fillPercentage))
4114     {
4115         DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_GEOMETRY_BIT);
4116     }
4117     virtual ~GeometryRandomProgram() = default;
4118 
4119     struct Arrangement : Prerequisites
4120     {
4121         static constexpr uint32_t NUM_SUBGROUPS_OFFSET    = 0u;
4122         static constexpr uint32_t SUBGROUP_SIZE_OFFSET    = 1u;
4123         static constexpr uint32_t INVOCATION_COUNT_OFFSET = 2u;
4124         static constexpr uint32_t MAX_LOC_OFFSET          = 3u;
4125         static constexpr uint32_t MAX_IDENTITY_OFFSET     = 4u;
4126         static constexpr uint32_t INVOCATION_ENTRY_OFFSET = 5u;
4127 
4128         const uint32_t m_shaderSubgroupSize;
4129         const uint32_t m_shaderSubgroupCount;
4130         const uint32_t m_shaderInvocationCount;
4131         const uint32_t m_shaderMaxLoc;
4132         const uint32_t m_shaderMaxIdentity;
4133 
4134         const uint32_t m_subgroupSize;
4135         const uint32_t m_primitiveStride;
4136         const uint32_t m_invocationStride;
4137         const uint32_t m_subgroupCount;
4138         const Ballots m_initialBallots;
4139         const std::vector<uint32_t> m_primitiveSubgroups;
4140 
Arrangementvkt::Reconvergence::__anon4f2394780111::GeometryRandomProgram::Arrangement4141         Arrangement(add_cref<std::vector<uint32_t>> outputP, uint32_t subgroupSize, uint32_t primitiveStride)
4142             : m_shaderSubgroupSize(outputP.at(SUBGROUP_SIZE_OFFSET))
4143             , m_shaderSubgroupCount(outputP.at(NUM_SUBGROUPS_OFFSET))
4144             , m_shaderInvocationCount(outputP.at(INVOCATION_COUNT_OFFSET))
4145             , m_shaderMaxLoc(outputP.at(MAX_LOC_OFFSET))
4146             , m_shaderMaxIdentity(outputP.at(MAX_IDENTITY_OFFSET))
4147             , m_subgroupSize(subgroupSize)
4148             , m_primitiveStride(primitiveStride)
4149             , m_invocationStride(primitiveStride)
4150             , m_subgroupCount(ROUNDUP(primitiveStride, subgroupSize) / subgroupSize)
4151             , m_initialBallots(makeInitialBallots(outputP))
4152             , m_primitiveSubgroups(makePrimitiveSubgroups(outputP))
4153         {
4154         }
makeInitialBallotsvkt::Reconvergence::__anon4f2394780111::GeometryRandomProgram::Arrangement4155         static Ballots makeInitialBallots(add_cref<std::vector<uint32_t>> outputP)
4156         {
4157             const uint32_t subgroupCount = outputP.at(NUM_SUBGROUPS_OFFSET);
4158             const uint32_t subgroupSize  = outputP.at(SUBGROUP_SIZE_OFFSET);
4159             DE_UNREF(subgroupSize);
4160             const uint32_t primitiveStride = outputP.at(INVOCATION_COUNT_OFFSET);
4161             Ballots b(subgroupCount);
4162             for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
4163             {
4164                 const uint32_t id = outputP.at(primitiveID + INVOCATION_ENTRY_OFFSET);
4165                 if (id)
4166                 {
4167                     const uint32_t subgroupID           = (id >> 16) - 1u;
4168                     const uint32_t subgroupInvocationID = id & 0xFFFF;
4169                     DE_ASSERT(subgroupID < subgroupCount);
4170                     DE_ASSERT(subgroupInvocationID < subgroupSize);
4171                     b.at(subgroupID).set(subgroupInvocationID);
4172                 }
4173             }
4174             return b;
4175         }
makePrimitiveSubgroupsvkt::Reconvergence::__anon4f2394780111::GeometryRandomProgram::Arrangement4176         static std::vector<uint32_t> makePrimitiveSubgroups(add_cref<std::vector<uint32_t>> outputP)
4177         {
4178             const uint32_t subgroupSize    = outputP.at(SUBGROUP_SIZE_OFFSET);
4179             const uint32_t primitiveStride = outputP.at(INVOCATION_COUNT_OFFSET);
4180             std::vector<uint32_t> map(primitiveStride);
4181             for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
4182             {
4183                 const uint32_t id = outputP.at(primitiveID + INVOCATION_ENTRY_OFFSET);
4184                 if (id)
4185                 {
4186                     const uint32_t subgroupID           = (id >> 16) - 1u;
4187                     const uint32_t subgroupInvocationID = id & 0xFFFF;
4188                     DE_ASSERT(subgroupInvocationID < subgroupSize);
4189                     map.at(primitiveID) = subgroupID * subgroupSize + subgroupInvocationID;
4190                 }
4191             }
4192             return map;
4193         }
calculatePrimitiveCountvkt::Reconvergence::__anon4f2394780111::GeometryRandomProgram::Arrangement4194         static uint32_t calculatePrimitiveCount(uint32_t width, uint32_t height, uint32_t fillPercent)
4195         {
4196             deRandom rnd;
4197             std::map<uint32_t, int> map;
4198             std::vector<tcu::Vec4> points;
4199             const uint32_t frags = (width * height);
4200             const uint32_t total = (frags * fillPercent) / 100u;
4201 
4202             deRandom_init(&rnd, (width * height));
4203 
4204             for (uint32_t i = 0u; i < total; ++i)
4205             {
4206                 const uint32_t r = deRandom_getUint32(&rnd) % frags;
4207                 if (map[r] != 0)
4208                 {
4209                     i -= 1;
4210                     continue;
4211                 }
4212                 map[r] = 1;
4213             }
4214 
4215             return static_cast<uint32_t>(map.size());
4216         }
generatePrimitivesvkt::Reconvergence::__anon4f2394780111::GeometryRandomProgram::Arrangement4217         static std::vector<tcu::Vec4> generatePrimitives(uint32_t width, uint32_t height, uint32_t fillPercent)
4218         {
4219             deRandom rnd;
4220             std::map<uint32_t, int> map;
4221             std::vector<tcu::Vec4> points;
4222             const uint32_t frags = (width * height);
4223             const uint32_t total = (frags * fillPercent) / 100u;
4224 
4225             deRandom_init(&rnd, (width * height));
4226 
4227             for (uint32_t i = 0u; i < total; ++i)
4228             {
4229                 const uint32_t r = deRandom_getUint32(&rnd) % frags;
4230                 if (map[r] != 0)
4231                 {
4232                     i -= 1;
4233                     continue;
4234                 }
4235                 map[r] = 1;
4236 
4237                 uint32_t y = r / width;
4238                 uint32_t x = r % width;
4239                 float xx   = (float(x) + float(x + 1)) / (2.0f * float(width));
4240                 float yy   = (float(y) + float(y + 1)) / (2.0f * float(height));
4241                 float xxx  = xx * 2.0f - 1.0f;
4242                 float yyy  = yy * 2.0f - 1.0f;
4243                 points.emplace_back(tcu::Vec4(xxx, yyy, 0u, 0u));
4244             }
4245             return points;
4246         }
generateVectorOutputPvkt::Reconvergence::__anon4f2394780111::GeometryRandomProgram::Arrangement4247         static std::vector<uint32_t> generateVectorOutputP(uint32_t subgroupSize, uint32_t primitiveStride)
4248         {
4249             const uint32_t subgroupCount = ROUNDUP(primitiveStride, subgroupSize) / subgroupSize;
4250             std::vector<uint32_t> outputP(primitiveStride + INVOCATION_ENTRY_OFFSET);
4251             outputP.at(NUM_SUBGROUPS_OFFSET)    = subgroupCount;
4252             outputP.at(SUBGROUP_SIZE_OFFSET)    = subgroupSize;
4253             outputP.at(INVOCATION_COUNT_OFFSET) = primitiveStride;
4254             outputP.at(MAX_LOC_OFFSET)          = 0u;
4255             outputP.at(MAX_IDENTITY_OFFSET)     = 0u;
4256             for (uint32_t vertexID = 0u; vertexID < primitiveStride; ++vertexID)
4257             {
4258                 const uint32_t subgroupID                      = vertexID / subgroupSize;
4259                 const uint32_t subgroupInvocationID            = vertexID % subgroupSize;
4260                 outputP.at(vertexID + INVOCATION_ENTRY_OFFSET) = ((subgroupID + 1u) << 16) | subgroupInvocationID;
4261             }
4262             return outputP;
4263         }
generateVectorOutputPvkt::Reconvergence::__anon4f2394780111::GeometryRandomProgram::Arrangement4264         static std::vector<uint32_t> generateVectorOutputP(uint32_t subgroupSize, uint32_t width, uint32_t height,
4265                                                            uint32_t percent)
4266         {
4267             const uint32_t primitiveStride = calculatePrimitiveCount(width, height, percent);
4268             return generateVectorOutputP(subgroupSize, primitiveStride);
4269         }
4270     };
4271 
simulate(bool countOnly,uint32_t subgroupSize,add_ref<std::vector<uint64_t>> ref)4272     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
4273     {
4274         DE_ASSERT(false); // use overloaded version of simulate() instead
4275         DE_UNREF(countOnly);
4276         DE_UNREF(subgroupSize);
4277         DE_UNREF(ref);
4278         return 0;
4279     }
4280 
4281 protected:
genIf(IFType ifType,uint32_t)4282     virtual void genIf(IFType ifType, uint32_t /*maxLocalIndexCmp*/) override
4283     {
4284         RandomProgram::genIf(ifType, RandomProgram::invocationStride);
4285     }
4286 
getPartitionBallotText()4287     virtual std::string getPartitionBallotText() override
4288     {
4289         return "storeValue(outLoc++, subgroupBallot(true))";
4290     }
4291 
printIfLocalInvocationIndex(add_ref<std::stringstream> css,add_cref<FlowState> flow)4292     virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
4293     {
4294         printIndent(css);
4295         css << "if (invocationIndex() >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
4296     }
4297 
printStore(add_ref<std::stringstream> css,add_cref<FlowState> flow)4298     virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
4299     {
4300         printIndent(css);
4301         css << "storeValue(outLoc++, 0x" << std::hex << flow.ops[flow.opsIndex].value << std::dec << ");\n";
4302     }
4303 
printBallot(add_ref<std::stringstream> css,add_cref<FlowState>,bool endWithSemicolon=false)4304     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
4305                              bool endWithSemicolon = false) override
4306     {
4307         printIndent(css);
4308         // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
4309         // a ballot, to make sure the ballot is "diverged enough". Don't do this for
4310         // subgroup_uniform_control_flow, since we only validate results that must be fully
4311         // reconverged.
4312         if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
4313         {
4314             css << getPartitionBallotText();
4315         }
4316         else
4317         {
4318             css << "storeValue(outLoc++, subgroupBallot(true))";
4319         }
4320         if (endWithSemicolon)
4321         {
4322             css << ";\n";
4323         }
4324     }
4325 
simulateBallot(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t unusedPrimitiveID,const int32_t opsIndex,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)4326     virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
4327                                 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
4328                                 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
4329                                 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
4330                                 const OPType reason, const tcu::UVec4 *cmp) override
4331     {
4332         DE_UNREF(unusedPrimitiveID);
4333         DE_UNREF(opsIndex);
4334         add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
4335         for (uint32_t primitiveID = 0u; primitiveID < a.m_primitiveStride; ++primitiveID)
4336         {
4337             const uint32_t sgid = a.m_primitiveSubgroups.at(primitiveID);
4338             DE_ASSERT(sgid < (a.m_subgroupCount * a.m_subgroupSize));
4339             if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
4340                 continue;
4341             const uint32_t index = (outLoc.at(primitiveID)++) * a.m_invocationStride + primitiveID;
4342             if (false == countOnly)
4343             {
4344                 ref.at(index) = Ballot(activeMask.at(sgid / a.m_subgroupSize));
4345                 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
4346                 {
4347                     logFailureCount -= 1u;
4348                     log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
4349                         << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
4350                 }
4351             }
4352         }
4353     }
4354 
simulateStore(const bool countOnly,add_cref<Ballots> activeMask,const uint32_t unusedPrimitiveID,const uint64_t storeValue,add_ref<std::vector<uint32_t>> outLoc,add_ref<std::vector<tcu::UVec4>> ref,add_ref<tcu::TestLog> log,std::shared_ptr<Prerequisites> prerequisites,add_ref<uint32_t> logFailureCount,const OPType reason,const tcu::UVec4 * cmp)4355     virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
4356                                const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
4357                                add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
4358                                std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
4359                                const OPType reason, const tcu::UVec4 *cmp) override
4360     {
4361         DE_UNREF(unusedPrimitiveID);
4362         add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
4363         for (uint32_t primitiveID = 0u; primitiveID < a.m_primitiveStride; ++primitiveID)
4364         {
4365             const uint32_t sgid = a.m_primitiveSubgroups.at(primitiveID);
4366             DE_ASSERT(sgid < (a.m_subgroupCount * a.m_subgroupSize));
4367             if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
4368                 continue;
4369             const uint32_t index = (outLoc.at(primitiveID)++) * a.m_invocationStride + primitiveID;
4370             if (false == countOnly)
4371             {
4372                 ref.at(index) = Ballot(tcu::UVec4(uint32_t(storeValue & 0xFFFFFFFF), 0u, 0u, 0u));
4373                 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
4374                 {
4375                     logFailureCount -= 1u;
4376                     log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
4377                         << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
4378                 }
4379             }
4380         }
4381     }
4382 
makePrerequisites(add_cref<std::vector<uint32_t>> outputP,const uint32_t subgroupSize,const uint32_t fragmentStride,const uint32_t primitiveStride,add_ref<std::vector<SubgroupState2>> stateStack,add_ref<std::vector<uint32_t>> outLoc,add_ref<uint32_t> subgroupCount)4383     virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
4384                                                              const uint32_t subgroupSize, const uint32_t fragmentStride,
4385                                                              const uint32_t primitiveStride,
4386                                                              add_ref<std::vector<SubgroupState2>> stateStack,
4387                                                              add_ref<std::vector<uint32_t>> outLoc,
4388                                                              add_ref<uint32_t> subgroupCount) override
4389     {
4390         DE_UNREF(fragmentStride);
4391         auto prerequisites = std::make_shared<Arrangement>(outputP, subgroupSize, primitiveStride);
4392         subgroupCount      = prerequisites->m_subgroupCount;
4393         stateStack.resize(10u, SubgroupState2(subgroupCount));
4394         outLoc.resize(primitiveStride, 0u);
4395         stateStack.at(0).activeMask = prerequisites->m_initialBallots;
4396         return prerequisites;
4397     }
4398 };
4399 
4400 class ReconvergenceTestCase : public TestCase
4401 {
4402 public:
ReconvergenceTestCase(tcu::TestContext & context,const std::string & name,const CaseDef data)4403     ReconvergenceTestCase(tcu::TestContext &context, const std::string &name, const CaseDef data)
4404         : TestCase(context, name)
4405         , m_data(data)
4406         , m_program()
4407         , m_subgroupSizeToMaxLoc()
4408     {
4409     }
4410     ~ReconvergenceTestCase(void) = default;
4411     virtual void delayedInit(void) override;
4412     virtual void checkSupport(Context &context) const override;
4413     virtual void initPrograms(SourceCollections &programCollection) const override;
4414     virtual TestInstance *createInstance(Context &context) const override;
4415     de::MovePtr<RandomProgram> selectProgram() const;
4416 
4417 private:
4418     CaseDef m_data;
4419     std::shared_ptr<RandomProgram> m_program;
4420     mutable std::map<uint32_t, uint32_t> m_subgroupSizeToMaxLoc;
4421 };
4422 
checkSupport(Context & context) const4423 void ReconvergenceTestCase::checkSupport(Context &context) const
4424 {
4425     if (!context.contextSupports(vk::ApiVersion(0u, 1u, 1u, 0u)))
4426         TCU_THROW(NotSupportedError, "Vulkan 1.1 not supported");
4427 
4428     const auto properties                                            = getSubgroupProperties(context);
4429     const vk::VkPhysicalDeviceSubgroupProperties &subgroupProperties = properties.first;
4430     const VkPhysicalDeviceLimits &limits                             = properties.second.properties.limits;
4431 
4432     if (m_data.isElect() && !(subgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_BASIC_BIT))
4433         TCU_THROW(NotSupportedError, "VK_SUBGROUP_FEATURE_BASIC_BIT not supported");
4434 
4435     if (!m_data.isElect() && !(subgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT))
4436         TCU_THROW(NotSupportedError, "VK_SUBGROUP_FEATURE_BALLOT_BIT not supported");
4437 
4438     if (m_data.shaderStage == VK_SHADER_STAGE_COMPUTE_BIT)
4439     {
4440         if ((m_data.sizeX > limits.maxComputeWorkGroupSize[0]) || (m_data.sizeY > limits.maxComputeWorkGroupSize[1]) ||
4441             ((m_data.sizeX * m_data.sizeY) > limits.maxComputeWorkGroupInvocations))
4442         {
4443             TCU_THROW(NotSupportedError, "compute workgroup count exceeds device limit");
4444         }
4445     }
4446 
4447     if (!(subgroupProperties.supportedStages & m_data.shaderStage))
4448     {
4449         std::stringstream ss;
4450         ss << getShaderStageFlagsStr(m_data.shaderStage);
4451         ss << " does not support subgroup operations";
4452         ss.flush();
4453         TCU_THROW(NotSupportedError, ss.str());
4454     }
4455 
4456     // Both subgroup- AND workgroup-uniform tests are enabled by shaderSubgroupUniformControlFlow.
4457     if (m_data.isUCF() && !context.getShaderSubgroupUniformControlFlowFeatures().shaderSubgroupUniformControlFlow)
4458         TCU_THROW(NotSupportedError, "shaderSubgroupUniformControlFlow not supported");
4459 
4460     if (m_data.testType == TT_MAXIMAL && !context.getShaderMaximalReconvergenceFeatures().shaderMaximalReconvergence)
4461         TCU_THROW(NotSupportedError, "shaderMaximalReconvergence not supported");
4462 }
4463 
selectProgram() const4464 de::MovePtr<RandomProgram> ReconvergenceTestCase::selectProgram() const
4465 {
4466     RandomProgram *programPtr(nullptr);
4467     switch (m_data.shaderStage)
4468     {
4469     case VK_SHADER_STAGE_COMPUTE_BIT:
4470         programPtr = new ComputeRandomProgram(m_data);
4471         break;
4472     case VK_SHADER_STAGE_FRAGMENT_BIT:
4473         programPtr = new FragmentRandomProgram(m_data);
4474         break;
4475     case VK_SHADER_STAGE_VERTEX_BIT:
4476         programPtr = new VertexRandomProgram(m_data);
4477         break;
4478     case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
4479         programPtr = new TessCtrlRandomProgram(m_data, 0);
4480         break;
4481     case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
4482         programPtr = new TessEvalRandomProgram(m_data);
4483         break;
4484     case VK_SHADER_STAGE_GEOMETRY_BIT:
4485         programPtr = new GeometryRandomProgram(m_data);
4486         break;
4487     default:
4488         DE_ASSERT(0);
4489     }
4490     DE_ASSERT(programPtr);
4491     return de::MovePtr<RandomProgram>(programPtr);
4492 }
4493 
// Returns the GLSL source of a fragment shader that writes vec4(1.0) to output location 0.
std::string genPassThroughFragmentSource()
{
    // adjacent literals form a single string
    return std::string("#version 450 core\n"
                       "layout(location = 0) out vec4 color;\n"
                       "void main() {\n"
                       "  color = vec4(1.0);\n"
                       "}\n");
}
4505 
// Returns the GLSL source of a vertex shader that forwards the xy of input location 0
// to gl_Position with z = 0.0 and w = 1.0.
std::string genPassThroughVertexSource()
{
    // adjacent literals form a single string
    return std::string("#version 450 core\n"
                       "layout(location = 0) in vec4 pos;\n"
                       "void main() {\n"
                       "   gl_Position = vec4(pos.xy, 0.0, 1.0);\n"
                       "}\n");
}
4517 
// Returns the GLSL source of a tessellation control shader (3 output vertices) that
// copies gl_Position per invocation and sets all outer and inner tessellation levels to 1.0.
std::string genPassThroughTessCtrlSource()
{
    std::stringstream src;
    src << "#version 450 core\n"
        << "#extension GL_EXT_tessellation_shader : require\n"
        << "layout(vertices = 3) out;\n"
        << "void main() {\n"
        << "   gl_out[gl_InvocationID].gl_Position = gl_in[gl_InvocationID].gl_Position;\n";

    // four outer levels followed by two inner levels, all 1.0
    for (int outer = 0; outer < 4; ++outer)
        src << "   gl_TessLevelOuter[" << outer << "] = 1.0;\n";
    for (int inner = 0; inner < 2; ++inner)
        src << "   gl_TessLevelInner[" << inner << "] = 1.0;\n";

    src << "}\n";
    return src.str();
}
4536 
// Returns the GLSL source of a tessellation evaluation shader (equal_spacing, triangles)
// that sets gl_Position to the gl_TessCoord-weighted sum of the three input positions
// (xy taken from the inputs, z = 0.0, w = 1.0).
std::string genPassThroughTessEvalSource()
{
    // adjacent literals form a single string
    return std::string("#version 450 core\n"
                       "#extension GL_EXT_tessellation_shader : require\n"
                       "layout(equal_spacing, triangles) in;\n"
                       "void main() {\n"
                       "   float u = gl_TessCoord.x;\n"
                       "   float v = gl_TessCoord.y;\n"
                       "   float w = gl_TessCoord.z;\n"
                       "   vec4 p0 = vec4(gl_in[0].gl_Position.xy, 0.0, 1.0);\n"
                       "   vec4 p1 = vec4(gl_in[1].gl_Position.xy, 0.0, 1.0);\n"
                       "   vec4 p2 = vec4(gl_in[2].gl_Position.xy, 0.0, 1.0);\n"
                       "   gl_Position = u * p0 + v * p1 + w * p2;\n"
                       "}\n");
}
4555 
delayedInit(void)4556 void ReconvergenceTestCase::delayedInit(void)
4557 {
4558     m_program = std::shared_ptr<RandomProgram>(selectProgram().release());
4559 }
4560 
initPrograms(SourceCollections & programCollection) const4561 void ReconvergenceTestCase::initPrograms(SourceCollections &programCollection) const
4562 {
4563     de::MovePtr<RandomProgram> program = selectProgram();
4564 
4565     m_subgroupSizeToMaxLoc = program->generateRandomProgram(m_testCtx.getWatchDog(), m_testCtx.getLog());
4566 
4567     std::stringstream header, layout, globals, prologue, epilogue, aux;
4568 
4569     header << "#version 450 core\n";
4570     header << "#extension GL_KHR_shader_subgroup_ballot : enable\n";
4571     header << "#extension GL_KHR_shader_subgroup_vote : enable\n";
4572     header << "#extension GL_NV_shader_subgroup_partitioned : enable\n";
4573     header << "#extension GL_EXT_subgroup_uniform_control_flow : enable\n";
4574     if (m_data.testType == TT_MAXIMAL)
4575     {
4576         header << "#extension GL_EXT_maximal_reconvergence : require\n";
4577     }
4578     switch (m_data.shaderStage)
4579     {
4580     case VK_SHADER_STAGE_COMPUTE_BIT:
4581         layout << "layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z = 1) in;\n";
4582         layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4583         layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec4 b[]; } outputB;\n";
4584         layout << "layout(set=0, binding=0) coherent buffer InputA  { uint  a[]; } inputA;\n";
4585         break;
4586     case VK_SHADER_STAGE_FRAGMENT_BIT:
4587         layout << "// NOTE: A fragment can belong to more than one primitive, and the shader processes each\n";
4588         layout << "//       fragment primitive by primitive, so the number of invocation does not have to be\n";
4589         layout << "//       equal to the number of fragments of the rendering area. Another important thing\n";
4590         layout << "//       is that the Implementation is free to change the order of draving primitives\n";
4591         layout << "//       between subsequent application calls.\n";
4592 
4593         layout << "// inputA.a[ invocationStride ] = { 0, 1, ..., (invocationStride - 1) }\n";
4594         layout << "layout(set=0, binding=0) coherent buffer InputA  { uint  a[]; } inputA;\n";
4595 
4596         layout << "// outputB.b[ max(loc[]) * invocationStride * primitiveStride ]\n";
4597         layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec4 b[]; } outputB;\n";
4598 
4599         layout << "// outputC.c[invocationStride * primitiveStride ], incremented per primitive\n";
4600         layout << "layout(set=0, binding=2) coherent buffer OutputC { uint  loc[]; } outputC;\n";
4601 
4602         layout << "// outputP.p[ width * height * primitiveStride + 1 ], one more for calculating subgroupID\n";
4603         layout << "layout(set=0, binding=3) coherent buffer OutputP { uint  p[]; } outputP;\n";
4604 
4605         layout << "layout(location = 0) out vec4 dEQP_FragColor;\n";
4606         break;
4607     case VK_SHADER_STAGE_VERTEX_BIT:
4608         layout << "layout(location = 0) in vec4 pos;\n";
4609         layout << "layout(set=0, binding=3) coherent buffer OutputP { uint  p[]; } outputP;\n";
4610         layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4611         layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec4 b[]; } outputB;\n";
4612         layout << "layout(set=0, binding=0) coherent buffer InputA  { uint  a[]; } inputA;\n";
4613         break;
4614     case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
4615         layout << "#extension GL_EXT_tessellation_shader : require\n";
4616         layout << "layout(vertices = " << TessCtrlRandomProgram::minSubgroupSize << ") out;\n";
4617         layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4618         layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec2 b[]; } outputB;\n";
4619         layout << "layout(set=0, binding=0) coherent buffer InputA  { uint  a[]; } inputA;\n";
4620         break;
4621     case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
4622         layout << "#extension GL_EXT_tessellation_shader : require\n";
4623         layout << "layout(equal_spacing, quads) in;\n";
4624         layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4625         layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec2 b[]; } outputB;\n";
4626         layout << "layout(set=0, binding=0) coherent buffer InputA  { uint  a[]; } inputA;\n";
4627         break;
4628     case VK_SHADER_STAGE_GEOMETRY_BIT:
4629         layout << "#extension GL_EXT_geometry_shader : require\n";
4630         layout << "layout(points) in;\n";
4631         layout << "layout(points, max_vertices = 1) out;\n";
4632         layout << "layout(set=0, binding=3) coherent buffer OutputP { uint  p[]; } outputP;\n";
4633         layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4634         layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec4 b[]; } outputB;\n";
4635         layout << "layout(set=0, binding=0) coherent buffer InputA  { uint  a[]; } inputA;\n";
4636         break;
4637     default:
4638         DE_ASSERT(0);
4639     }
4640 
4641     std::stringstream pushConstantLayout;
4642     pushConstantLayout
4643         << "layout(push_constant) uniform PC {\n"
4644            "   // set to the real stride when writing out ballots, or zero when just counting\n"
4645            "   int  invocationStride;\n"
4646            "   // wildcard fields, for an example the dimensions of rendered area in the case of graphics shaders\n"
4647            "   int  width;\n"
4648            "   int  height;\n"
4649            "   uint primitiveStride;\n"
4650            "   uint subgroupStride;\n"
4651            "   uint enableInvocationIndex;\n"
4652            "};\n";
4653     pushConstantLayout.flush();
4654     layout << pushConstantLayout.str();
4655 
4656     globals << "int outLoc = 0;\n";
4657     globals << "bool testBit(uvec4 mask, uint bit) { return ((mask[bit / 32] >> (bit % 32)) & 1) != 0; }\n";
4658     globals << "uint elect() { return int(subgroupElect()) + 1; }\n";
4659     if (m_data.shaderStage == VK_SHADER_STAGE_FRAGMENT_BIT)
4660     {
4661         static const std::string helperRoutinesCode(R"glsl(
4662         void setBit(uint bit, in out uvec4 ballot) {
4663             uint c = bit / 32;
4664             switch (c) {
4665                 case 0: ballot.x |= (1u << (bit % 32)); break;
4666                 case 1: ballot.y |= (1u << (bit % 32)); break;
4667                 case 2: ballot.z |= (1u << (bit % 32)); break;
4668                 case 3: ballot.w |= (1u << (bit % 32)); break;
4669             }
4670         }
4671         void resetBit(uint bit, in out uvec4 ballot) {
4672             uint c = bit / 32;
4673             uint mask = 0xFFFFFFFF ^ (1u << (bit % 32));
4674             switch (c) {
4675                 case 0: ballot.x &= mask; break;
4676                 case 1: ballot.y &= mask; break;
4677                 case 2: ballot.z &= mask; break;
4678                 case 3: ballot.w &= mask; break;
4679             }
4680         }
4681         uint fragmentIndex() { return (uint(gl_FragCoord.y) * width + uint(gl_FragCoord.x)); }
4682         uint invocationIndex() { return subgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; }
4683         uvec4 invocationElectBallot() {
4684             uvec4 ballot = uvec4(0);
4685             ballot[gl_SubgroupInvocationID / 32] = (1 << (gl_SubgroupInvocationID % 32));
4686             return ballot;
4687         }
4688         uint next(uint hint) {
4689             return gl_HelperInvocation
4690                 ? (hint * enableInvocationIndex)
4691                 : outputC.loc[(gl_PrimitiveID * (subgroupStride * 128) + invocationIndex()) * enableInvocationIndex]++;
4692         }
4693         uint index(uint hint) {
4694             return ((
4695                 next(hint) * (subgroupStride * 128 * primitiveStride)
4696                 + (gl_PrimitiveID * subgroupStride * 128) + invocationIndex()) * enableInvocationIndex);
4697         }
4698         void storeValue(uint hintIndex, uvec4 value)
4699         {
4700             if (gl_HelperInvocation) {
4701                 if (hintIndex < BALLOT_STACK_SIZE)
4702                     ballotStack[hintIndex] = value;
4703             }
4704             else {
4705                 outputB.b[index(hintIndex)] = value;
4706             }
4707         }
4708         void storeValue(uint hintIndex, uint value) { storeValue(hintIndex, uvec4(value, 0, 0, 0)); }
4709         void storeBallot(uint hintIndex) { storeValue(hintIndex, subgroupBallot(true)); }
4710         )glsl");
4711 
4712         static const std::string prologueCode(R"glsl(
4713         uint helperInvocationCount = 0u;
4714         uint nonHelperInvocationCount = 0u;
4715         uvec4 helperInvocationsBits = uvec4(0, 0, 0, 0);
4716         uvec4 nonHelperInvocationsBits = uvec4(0, 0, 0, 0);
4717         if (gl_HelperInvocation)
4718         {
4719             helperInvocationsBits = subgroupBallot(true);
4720             helperInvocationCount = 1u;
4721         }
4722         else
4723         {
4724             nonHelperInvocationsBits = subgroupBallot(true);
4725             nonHelperInvocationCount = 1u;
4726         }
4727 
4728         helperInvocationsBits = subgroupOr(helperInvocationsBits);
4729         nonHelperInvocationsBits = subgroupOr(nonHelperInvocationsBits);
4730         uint helperBitCount = subgroupBallotBitCount(helperInvocationsBits);
4731         uint nonHelperBitCount = subgroupBallotBitCount(nonHelperInvocationsBits);
4732         helperInvocationCount = subgroupAdd(helperInvocationCount);
4733         nonHelperInvocationCount = subgroupAdd(nonHelperInvocationCount);
4734 
4735         const uint nonHelperElectBit = subgroupBallotFindLSB(nonHelperInvocationsBits);
4736         if (gl_SubgroupInvocationID == nonHelperElectBit)
4737         {
4738             subgroupID = atomicAdd(outputP.p[width * height * primitiveStride + 0], 1);
4739             outputP.p[width * height * primitiveStride + 1] = gl_SubgroupSize;
4740             atomicAdd(outputP.p[width * height * primitiveStride + 2], nonHelperInvocationCount);
4741             atomicAdd(outputP.p[width * height * primitiveStride + 3], helperInvocationCount);
4742         }
4743 
4744         subgroupID = subgroupShuffle(subgroupID, nonHelperElectBit);
4745 
4746         const uint localPrimitiveID = gl_PrimitiveID;
4747         const uint localFragmentID = fragmentIndex();
4748 
4749         if (!gl_HelperInvocation)
4750         {
4751             outputP.p[localFragmentID * primitiveStride + localPrimitiveID] =
4752                 ((subgroupID + 1) << 16) | gl_SubgroupInvocationID;
4753         }
4754 
4755         // Maping helper invocations block
4756         {
4757             uvec4 tmpHelperBits = helperInvocationsBits;
4758             uint helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4759             while (subgroupBallotBitExtract(tmpHelperBits, helperSubgroupInvocationID))
4760             {
4761                 uint helperSubgroupID = subgroupShuffle(subgroupID, helperSubgroupInvocationID);
4762                 uint helperFragmentID = subgroupShuffle(localFragmentID, helperSubgroupInvocationID);
4763                 uint helperPrimitiveID = subgroupShuffle(localPrimitiveID, helperSubgroupInvocationID);
4764                 if (gl_SubgroupInvocationID == nonHelperElectBit)
4765                 {
4766                     outputP.p[helperFragmentID * primitiveStride + helperPrimitiveID] =
4767                         (((helperSubgroupID + 1) | 0x8000) << 16) | helperSubgroupInvocationID;
4768                 }
4769                 resetBit(helperSubgroupInvocationID, tmpHelperBits);
4770                 helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4771             }
4772         }
4773         )glsl");
4774 
4775         static const std::string epilogueCode(R"glsl(
4776         // Save helper invocations entries block
4777         {
4778             uvec4 tmpHelperBits = subgroupOr(helperInvocationsBits);
4779             uint helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4780             while (helperSubgroupInvocationID < gl_SubgroupSize)
4781             {
4782                 const uint maxOutLoc = subgroupShuffle(outLoc, helperSubgroupInvocationID);
4783                 if (maxOutLoc == 0)
4784                 {
4785                     resetBit(helperSubgroupInvocationID, tmpHelperBits);
4786                     helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4787                     continue;
4788                 }
4789 
4790                 uvec4 helperBallotStack[BALLOT_STACK_SIZE];
4791                 uint helperSubgroupID = subgroupShuffle(subgroupID, helperSubgroupInvocationID);
4792                 uint helperFragmentID = subgroupShuffle(localFragmentID, helperSubgroupInvocationID);
4793                 uint helperPrimitiveID = subgroupShuffle(localPrimitiveID, helperSubgroupInvocationID);
4794                 for (uint i = 0; i < maxOutLoc && i < BALLOT_STACK_SIZE; i++) {
4795                     helperBallotStack[i] = subgroupShuffle(ballotStack[i], helperSubgroupInvocationID);
4796                 }
4797 
4798                 if (gl_SubgroupInvocationID == nonHelperElectBit)
4799                 {
4800                     uint helperInvocationIndex = helperSubgroupID * gl_SubgroupSize + helperSubgroupInvocationID;
4801                     uint helperPrimitiveInvocationIndex = helperInvocationIndex * primitiveStride + helperPrimitiveID;
4802 
4803                     outputC.loc[(helperInvocationIndex * primitiveStride + helperPrimitiveID) * enableInvocationIndex] = maxOutLoc;
4804 
4805                     for (uint j = 0; j < maxOutLoc; j++)
4806                     {
4807                         uint outputIndex = ((j * (subgroupStride * 128u * primitiveStride)
4808                             + (helperPrimitiveID * subgroupStride * 128u) + helperInvocationIndex) * enableInvocationIndex);
4809                         uvec4 outputValue = (j < BALLOT_STACK_SIZE) ? helperBallotStack[j] : uvec4(0,0,0,0);
4810                         outputB.b[outputIndex] = outputValue;
4811                     }
4812                 }
4813                 resetBit(helperSubgroupInvocationID, tmpHelperBits);
4814                 helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4815             } // wend
4816         }
4817 
4818         dEQP_FragColor = vec4(1.0);
4819         )glsl");
4820 
4821         header << "#extension GL_KHR_shader_subgroup_shuffle : enable\n";
4822         header << "#extension GL_KHR_shader_subgroup_arithmetic : enable\n";
4823         header << "#define BALLOT_STACK_SIZE " << FragmentRandomProgram::experimentalOutLocSize << '\n';
4824 
4825         {
4826             aux << header.str();
4827             aux << pushConstantLayout.str();
4828             aux << "uint outLoc = 0;\n";
4829             aux << "struct OutputC { uint loc[1]; };\n";
4830             aux << "struct OutputB { uvec4 b[1]; };\n";
4831             aux << "uint subgroupID = 11111;\n";
4832             aux << "uvec4 ballotStack[BALLOT_STACK_SIZE];\n";
4833             aux << "OutputC outputC;\n";
4834             aux << "OutputB outputB;\n";
4835             aux << "// OutputP.p[ width * height * primitiveStride + 4 ], few more for calculating subgroupID, "
4836                    "subgroupSize, non-helper and helper invocations\n";
4837             aux << "layout(set = 0, binding = 0) coherent buffer OutputP { uint p[]; } outputP;\n";
4838             aux << "layout(location = 0) out vec4 dEQP_FragColor;\n";
4839             aux << helperRoutinesCode;
4840             aux << "void main() {\n"
4841                 << prologueCode << epilogueCode << "   \n"
4842                 << "}\n";
4843         }
4844 
4845         globals << "uint subgroupID = 22222;\n";
4846         globals << "uvec4 ballotStack[BALLOT_STACK_SIZE];\n";
4847         globals << helperRoutinesCode;
4848 
4849         prologue << prologueCode;
4850         epilogue << epilogueCode;
4851     }
4852     else if (m_data.shaderStage == VK_SHADER_STAGE_VERTEX_BIT)
4853     {
4854         static const std::string helperRoutinesCode(R"glsl(
4855         uint invocationIndex() { return subgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; }
4856         uvec4 invocationElectBallot() {
4857             uvec4 ballot = uvec4(0);
4858             ballot[gl_SubgroupInvocationID / 32] = (1 << (gl_SubgroupInvocationID % 32));
4859             return ballot;
4860         }
4861         void storeValue(uint loc, uvec4 value) {
4862             outputC.loc[gl_VertexIndex] = loc + 1u;
4863             outputB.b[(loc * invocationStride + gl_VertexIndex) * enableInvocationIndex] = value;
4864         }
4865         void storeValue(uint loc, uint value) { storeValue(loc, uvec4(value, 0, 0, 0)); }
4866         )glsl");
4867 
4868         static const std::string prologueCode(R"glsl(
4869         uint invocationCount = 1u;
4870         invocationCount = subgroupAdd(invocationCount);
4871 
4872         if (subgroupElect())
4873         {
4874             subgroupID = atomicAdd(outputP.p[NUM_SUBGROUPS_OFFSET], 1u);    // [+0]    subgroupID
4875             outputP.p[SUBGROUP_SIZE_OFFSET] = gl_SubgroupSize;                // [+1]    subgroupSize
4876             atomicAdd(outputP.p[INVOCATION_COUNT_OFFSET], invocationCount);    // [+2]    invocationCount
4877         }
4878         subgroupID = subgroupBroadcastFirst(subgroupID);
4879 
4880         outputP.p[gl_VertexIndex + INVOCATION_ENTRIES_OFFSET] = ((subgroupID + 1) << 16) | gl_SubgroupInvocationID;
4881         )glsl");
4882 
4883         static const std::string epilogueCode(R"glsl(
4884         gl_Position = vec4(pos.xy, 0.0, 1.0);
4885         gl_PointSize = 1.0;
4886         )glsl");
4887 
4888         header << "#extension GL_KHR_shader_subgroup_arithmetic : enable\n";
4889         header << "#define NUM_SUBGROUPS_OFFSET            0\n";
4890         header << "#define SUBGROUP_SIZE_OFFSET            1\n";
4891         header << "#define INVOCATION_COUNT_OFFSET        2\n";
4892         header << "#define INVOCATION_ENTRIES_OFFSET    3\n";
4893 
4894         globals << "uint subgroupID = 33333;\n";
4895         globals << helperRoutinesCode;
4896 
4897         prologue << prologueCode;
4898         epilogue << epilogueCode;
4899     }
4900     else if (m_data.shaderStage == VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)
4901     {
4902         // push_constant::width holds the smallest subgroup size defined in TessCtrlRandomProgram::minSubgroupSize
4903         globals << "// push_constant::width is the smallest subgroup size which this shader is run on\n";
4904         globals << "uint invocationIndex() { return ((((gl_PrimitiveID * width) / gl_SubgroupSize) * gl_SubgroupSize) "
4905                    "+ gl_SubgroupInvocationID); }\n";
4906 
4907         epilogue
4908             << "   gl_out[gl_InvocationID].gl_Position = gl_in[gl_InvocationID % gl_PatchVerticesIn].gl_Position;\n";
4909         epilogue << "   gl_TessLevelOuter[0] = 1.0;\n";
4910         epilogue << "   gl_TessLevelOuter[1] = 1.0;\n";
4911         epilogue << "   gl_TessLevelOuter[2] = 1.0;\n";
4912         epilogue << "   gl_TessLevelOuter[3] = 1.0;\n";
4913         epilogue << "   gl_TessLevelInner[0] = 1.0;\n";
4914         epilogue << "   gl_TessLevelInner[1] = 1.0;\n";
4915     }
4916     else if (m_data.shaderStage == VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)
4917     {
4918         globals << "// push_constant::width is an invocation count when processing a quad for a single patch\n";
4919         globals << "uint invocationIndex() { return ((((gl_PrimitiveID * width) / gl_SubgroupSize) * gl_SubgroupSize) "
4920                    "+ gl_SubgroupInvocationID); }\n";
4921 
4922         epilogue << "   float u = gl_TessCoord.x;\n";
4923         epilogue << "   float v = gl_TessCoord.y;\n";
4924         epilogue << "   float w = gl_TessCoord.z;\n";
4925         epilogue << "   vec4 p0 = vec4(gl_in[0].gl_Position.xy, 0.0, 1.0);\n";
4926         epilogue << "   vec4 p1 = vec4(gl_in[1].gl_Position.xy, 0.0, 1.0);\n";
4927         epilogue << "   vec4 p2 = vec4(gl_in[2].gl_Position.xy, 0.0, 1.0);\n";
4928         epilogue << "   gl_Position = u * p0 + v * p1 + w * p2;\n";
4929     }
4930     else if (m_data.shaderStage == VK_SHADER_STAGE_GEOMETRY_BIT)
4931     {
4932         static const std::string helperRoutinesCode(R"glsl(
4933         uint invocationIndex() { return subgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; }
4934         void storeValue(uint loc, uvec4 value) {
4935             outputC.loc[gl_PrimitiveIDIn] = loc + 1u;
4936             outputB.b[(loc * invocationStride + gl_PrimitiveIDIn) * enableInvocationIndex] = value;
4937         }
4938         void storeValue(uint loc, uint value) { storeValue(loc, uvec4(value, 0, 0, 0)); }
4939         void storeBallot(uint loc) { storeValue(loc, subgroupBallot(true)); }
4940         uvec4 invocationElectBallot() {
4941             uvec4 ballot = uvec4(0);
4942             ballot[gl_SubgroupInvocationID / 32] = (1 << (gl_SubgroupInvocationID % 32));
4943             return ballot;
4944         }
4945         )glsl");
4946 
4947         static const std::string prologueCode(R"glsl(
4948         uint invocationCount = 1u;
4949         invocationCount = subgroupAdd(invocationCount);
4950         uint identity = gl_PrimitiveIDIn + 1u;
4951         uint maxIdentity = subgroupMax(identity);
4952 
4953         if (subgroupElect()) {
4954             subgroupID = atomicAdd(outputP.p[SUBGROUP_ID_OFFSET], 1u);            // [+0]    subgroupID
4955             outputP.p[SUBGROUP_SIZE_OFFSET] = gl_SubgroupSize;                    // [+1]    subgroupSize
4956             atomicAdd(outputP.p[INVOCATION_COUNT_OFFSET], invocationCount);        // [+2]    invocationCount
4957             atomicMax(outputP.p[MAX_IDENTITY_OFFSET], maxIdentity);
4958         }
4959         subgroupID = subgroupBroadcastFirst(subgroupID);
4960 
4961         outputP.p[gl_PrimitiveIDIn + INVOCATION_ENTRY_OFFSET] = ((subgroupID + 1) << 16) | gl_SubgroupInvocationID;
4962 
4963         )glsl");
4964 
4965         static const std::string epilogueCode(R"glsl(
4966         uint maxLoc = subgroupMax(outLoc);
4967         atomicMax(outputP.p[MAX_LOC_OFFSET], maxLoc);
4968 
4969         gl_Position = gl_in[gl_PrimitiveIDIn].gl_Position;
4970         gl_PrimitiveID = gl_PrimitiveIDIn;
4971 
4972         EmitVertex();
4973         EndPrimitive();
4974         )glsl");
4975 
4976         header << "#extension GL_KHR_shader_subgroup_arithmetic : enable\n";
4977         header << "#define SUBGROUP_ID_OFFSET       0\n";
4978         header << "#define SUBGROUP_SIZE_OFFSET     1\n";
4979         header << "#define INVOCATION_COUNT_OFFSET  2\n";
4980         header << "#define MAX_LOC_OFFSET           3\n";
4981         header << "#define MAX_IDENTITY_OFFSET      4\n";
4982         header << "#define INVOCATION_ENTRY_OFFSET  5\n";
4983 
4984         globals << "uint subgroupID;\n";
4985         globals << "uint numSubgroups;\n";
4986         globals << helperRoutinesCode;
4987 
4988         prologue << prologueCode;
4989         epilogue << epilogueCode;
4990     }
4991 
4992     std::stringstream css, functions, main;
4993     m_program->printCode(functions, main);
4994 
4995     css << header.str();
4996     css << layout.str();
4997     css << globals.str();
4998 
4999     css << functions.str() << "\n\n";
5000 
5001     css << "void main()\n"
5002         << (m_data.isSUCF() ? "[[subgroup_uniform_control_flow]]\n" : "")
5003         << (m_data.testType == TT_MAXIMAL ? "[[maximally_reconverges]]\n" : "") << "{\n";
5004 
5005     css << prologue.str() << "\n";
5006     css << main.str() << "\n\n";
5007     css << epilogue.str() << "\n";
5008 
5009     css << "}\n";
5010 
5011     const vk::ShaderBuildOptions buildOptions(programCollection.usedVulkanVersion, vk::SPIRV_VERSION_1_3, 0u);
5012 
5013     auto &testingShader = programCollection.glslSources.add("test");
5014     switch (m_data.shaderStage)
5015     {
5016     case VK_SHADER_STAGE_COMPUTE_BIT:
5017         testingShader << glu::ComputeSource(css.str()) << buildOptions;
5018         break;
5019     case VK_SHADER_STAGE_FRAGMENT_BIT:
5020         testingShader << glu::FragmentSource(css.str()) << buildOptions;
5021         programCollection.glslSources.add("vert") << glu::VertexSource(genPassThroughVertexSource()) << buildOptions;
5022         programCollection.glslSources.add("aux") << glu::FragmentSource(aux.str()) << buildOptions;
5023         break;
5024     case VK_SHADER_STAGE_VERTEX_BIT:
5025         testingShader << glu::VertexSource(css.str()) << buildOptions;
5026         programCollection.glslSources.add("frag") << glu::FragmentSource(genPassThroughFragmentSource());
5027         break;
5028     case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
5029         testingShader << glu::TessellationControlSource(css.str()) << buildOptions;
5030         programCollection.glslSources.add("vert") << glu::VertexSource(genPassThroughVertexSource());
5031         programCollection.glslSources.add("frag") << glu::FragmentSource(genPassThroughFragmentSource());
5032         programCollection.glslSources.add("tese") << glu::TessellationEvaluationSource(genPassThroughTessEvalSource());
5033         break;
5034     case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
5035         testingShader << glu::TessellationEvaluationSource(css.str()) << buildOptions;
5036         programCollection.glslSources.add("vert") << glu::VertexSource(genPassThroughVertexSource());
5037         programCollection.glslSources.add("frag") << glu::FragmentSource(genPassThroughFragmentSource());
5038         programCollection.glslSources.add("tesc") << glu::TessellationControlSource(genPassThroughTessCtrlSource());
5039         break;
5040     case VK_SHADER_STAGE_GEOMETRY_BIT:
5041         testingShader << glu::GeometrySource(css.str()) << buildOptions;
5042         programCollection.glslSources.add("vert") << glu::VertexSource(genPassThroughVertexSource());
5043         programCollection.glslSources.add("frag") << glu::FragmentSource(genPassThroughFragmentSource());
5044         break;
5045     default:
5046         DE_ASSERT(0);
5047     }
5048 }
5049 
createInstance(Context & context) const5050 TestInstance *ReconvergenceTestCase::createInstance(Context &context) const
5051 {
5052     switch (m_data.shaderStage)
5053     {
5054     case VK_SHADER_STAGE_COMPUTE_BIT:
5055         return new ReconvergenceTestComputeInstance(context, m_data, m_program, std::move(m_subgroupSizeToMaxLoc));
5056     case VK_SHADER_STAGE_FRAGMENT_BIT:
5057         return new ReconvergenceTestFragmentInstance(context, m_data);
5058     case VK_SHADER_STAGE_VERTEX_BIT:
5059         return new ReconvergenceTestVertexInstance(context, m_data);
5060     case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
5061         return new ReconvergenceTestTessCtrlInstance(context, m_data);
5062     case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
5063         return new ReconvergenceTestTessEvalInstance(context, m_data);
5064     case VK_SHADER_STAGE_GEOMETRY_BIT:
5065         return new ReconvergenceTestGeometryInstance(context, m_data);
5066     default:
5067         DE_ASSERT(false);
5068     }
5069     return nullptr;
5070 }
5071 
iterate(void)5072 tcu::TestStatus ReconvergenceTestComputeInstance::iterate(void)
5073 {
5074     const DeviceInterface &vk            = m_context.getDeviceInterface();
5075     const VkDevice device                = m_context.getDevice();
5076     Allocator &allocator                 = m_context.getDefaultAllocator();
5077     tcu::TestLog &log                    = m_context.getTestContext().getLog();
5078     const VkPhysicalDeviceLimits &limits = m_context.getDeviceProperties().limits;
5079 
5080     const uint32_t invocationStride = m_data.sizeX * m_data.sizeY;
5081 
5082     std::vector<tcu::UVec4> ref;
5083     add_ref<ComputeRandomProgram> program(*m_program);
5084 
5085     uint32_t precalculatedMaxLoc = 0u;
5086     if (auto itPrecalculatedMaxLoc = m_subgroupSizeToMaxLoc.find(m_subgroupSize);
5087         itPrecalculatedMaxLoc != m_subgroupSizeToMaxLoc.end())
5088     {
5089         precalculatedMaxLoc = itPrecalculatedMaxLoc->second;
5090     }
5091     uint32_t maxLoc       = precalculatedMaxLoc ? precalculatedMaxLoc :
5092                                                   program.execute(m_context.getTestContext().getWatchDog(), true,
5093                                                                   m_subgroupSize, 0u, invocationStride, ref, log);
5094     uint32_t shaderMaxLoc = maxLoc;
5095 
5096     // maxLoc is per-invocation. Add one (to make sure no additional writes are done) and multiply by
5097     // the number of invocations
5098     maxLoc++;
5099     maxLoc *= invocationStride;
5100 
5101     // buffer[0] is an input filled with a[i] == i
5102     // buffer[1] is the output
5103     // buffer[2] is the location counts
5104     de::MovePtr<BufferWithMemory> buffers[3];
5105     vk::VkDescriptorBufferInfo bufferDescriptors[3];
5106 
5107     VkDeviceSize sizes[3] = {
5108         invocationStride * sizeof(uint32_t),
5109         maxLoc * sizeof(tcu::UVec4),
5110         invocationStride * sizeof(uint32_t),
5111     };
5112 
5113     for (uint32_t i = 0; i < 3; ++i)
5114     {
5115         if (sizes[i] > limits.maxStorageBufferRange)
5116             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
5117 
5118         try
5119         {
5120             buffers[i] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
5121                 vk, device, allocator,
5122                 makeBufferCreateInfo(sizes[i], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
5123                                                    VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
5124                 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
5125         }
5126         catch (tcu::ResourceError &)
5127         {
5128             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
5129             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
5130                                    "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
5131         }
5132         bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
5133     }
5134 
5135     void *ptrs[3];
5136     for (uint32_t i = 0; i < 3; ++i)
5137     {
5138         ptrs[i] = buffers[i]->getAllocation().getHostPtr();
5139     }
5140     for (uint32_t i = 0; i < sizes[0] / sizeof(uint32_t); ++i)
5141     {
5142         ((uint32_t *)ptrs[0])[i] = i;
5143     }
5144     deMemset(ptrs[1], 0, (size_t)sizes[1]);
5145     deMemset(ptrs[2], 0, (size_t)sizes[2]);
5146 
5147     vk::DescriptorSetLayoutBuilder layoutBuilder;
5148 
5149     layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_data.shaderStage);
5150     layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_data.shaderStage);
5151     layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_data.shaderStage);
5152 
5153     vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
5154 
5155     vk::Unique<vk::VkDescriptorPool> descriptorPool(
5156         vk::DescriptorPoolBuilder()
5157             .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 3u)
5158             .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
5159     vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
5160 
5161     const VkPushConstantRange pushConstantRange = {
5162         (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
5163         0u,                                     // uint32_t offset;
5164         sizeof(PushConstant)                    // uint32_t size;
5165     };
5166 
5167     const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = {
5168         VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
5169         DE_NULL,                                       // pNext
5170         (VkPipelineLayoutCreateFlags)0,
5171         1,                          // setLayoutCount
5172         &descriptorSetLayout.get(), // pSetLayouts
5173         1u,                         // pushConstantRangeCount
5174         &pushConstantRange,         // pPushConstantRanges
5175     };
5176 
5177     flushAlloc(vk, device, buffers[0]->getAllocation());
5178     flushAlloc(vk, device, buffers[1]->getAllocation());
5179     flushAlloc(vk, device, buffers[2]->getAllocation());
5180 
5181     const VkPipelineBindPoint bindPoint = VK_PIPELINE_BIND_POINT_COMPUTE;
5182     const Unique<VkShaderModule> shader(createShaderModule(vk, device, m_context.getBinaryCollection().get("test"), 0));
5183     Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
5184     Move<VkPipeline> pipeline             = createComputePipeline(*pipelineLayout, *shader);
5185     const VkQueue queue                   = m_context.getUniversalQueue();
5186     Move<VkCommandPool> cmdPool     = createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
5187                                                         m_context.getUniversalQueueFamilyIndex());
5188     Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
5189 
5190     vk::DescriptorSetUpdateBuilder setUpdateBuilder;
5191     setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(0),
5192                                  VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[0]);
5193     setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(1),
5194                                  VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[1]);
5195     setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(2),
5196                                  VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[2]);
5197     setUpdateBuilder.update(vk, device);
5198 
5199     PushConstant pc{/* pcinvocationStride is initialized with 0, the rest of fields as well */};
5200 
5201     // compute "maxLoc", the maximum number of locations written
5202     beginCommandBuffer(vk, *cmdBuffer, 0u);
5203     vk.cmdBindDescriptorSets(*cmdBuffer, bindPoint, *pipelineLayout, 0u, 1, &*descriptorSet, 0u, DE_NULL);
5204     vk.cmdBindPipeline(*cmdBuffer, bindPoint, *pipeline);
5205     vk.cmdPushConstants(*cmdBuffer, *pipelineLayout, m_data.shaderStage, 0, sizeof(pc), &pc);
5206     vk.cmdDispatch(*cmdBuffer, 1, 1, 1);
5207     endCommandBuffer(vk, *cmdBuffer);
5208 
5209     submitCommandsAndWait(vk, device, queue, cmdBuffer.get());
5210 
5211     invalidateAlloc(vk, device, buffers[1]->getAllocation());
5212     invalidateAlloc(vk, device, buffers[2]->getAllocation());
5213 
5214     // Take the max over all invocations. Add one (to make sure no additional writes are done) and multiply by
5215     // the number of invocations
5216     uint32_t newMaxLoc = 0;
5217     for (uint32_t id = 0; id < invocationStride; ++id)
5218         newMaxLoc = de::max(newMaxLoc, ((uint32_t *)ptrs[2])[id]);
5219     shaderMaxLoc = newMaxLoc;
5220     newMaxLoc++;
5221     newMaxLoc *= invocationStride;
5222 
5223     // If we need more space, reallocate buffers[1]
5224     if (newMaxLoc > maxLoc)
5225     {
5226         maxLoc   = newMaxLoc;
5227         sizes[1] = maxLoc * sizeof(tcu::UVec4);
5228 
5229         if (sizes[1] > limits.maxStorageBufferRange)
5230             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
5231 
5232         try
5233         {
5234             buffers[1] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
5235                 vk, device, allocator,
5236                 makeBufferCreateInfo(sizes[1], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
5237                                                    VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
5238                 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
5239         }
5240         catch (tcu::ResourceError &)
5241         {
5242             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
5243             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
5244                                    "Failed device memory allocation " + de::toString(sizes[1]) + " bytes");
5245         }
5246         bufferDescriptors[1] = makeDescriptorBufferInfo(**buffers[1], 0, sizes[1]);
5247         ptrs[1]              = buffers[1]->getAllocation().getHostPtr();
5248 
5249         vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
5250         setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(1),
5251                                       VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[1]);
5252         setUpdateBuilder2.update(vk, device);
5253     }
5254 
5255     // Clear any writes to buffer[1] during the counting pass
5256     deMemset(ptrs[1], 0, (size_t)sizes[1]);
5257     flushAlloc(vk, device, buffers[1]->getAllocation());
5258     // Clear any writes to buffer[2] during the counting pass
5259     deMemset(ptrs[2], 0, (size_t)sizes[2]);
5260     flushAlloc(vk, device, buffers[2]->getAllocation());
5261 
5262     // change invocationStride value in shader
5263     pc.invocationStride = invocationStride;
5264 
5265     // run the actual shader
5266     beginCommandBuffer(vk, *cmdBuffer, 0u);
5267     vk.cmdBindDescriptorSets(*cmdBuffer, bindPoint, *pipelineLayout, 0u, 1, &*descriptorSet, 0u, DE_NULL);
5268     vk.cmdBindPipeline(*cmdBuffer, bindPoint, *pipeline);
5269     vk.cmdPushConstants(*cmdBuffer, *pipelineLayout, m_data.shaderStage, 0, sizeof(pc), &pc);
5270     vk.cmdDispatch(*cmdBuffer, 1, 1, 1);
5271     endCommandBuffer(vk, *cmdBuffer);
5272 
5273     submitCommandsAndWait(vk, device, queue, cmdBuffer.get());
5274 
5275     invalidateAlloc(vk, device, buffers[1]->getAllocation());
5276 
5277     // Simulate execution on the CPU, and compare against the GPU result
5278     try
5279     {
5280         ref.resize(maxLoc, tcu::UVec4());
5281     }
5282     catch (const std::bad_alloc &)
5283     {
5284         // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
5285         return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
5286                                "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
5287     }
5288 
5289     program.execute(m_context.getTestContext().getWatchDog(), false, m_subgroupSize, 0u, invocationStride, ref, log);
5290 
5291     const tcu::UVec4 *result = (const tcu::UVec4 *)ptrs[1];
5292 
5293     qpTestResult res = calculateAndLogResult(result, ref, invocationStride, m_subgroupSize, shaderMaxLoc);
5294 
5295     return tcu::TestStatus(res, qpGetTestResultName(res));
5296 }
5297 
calculateAndLogResult(const tcu::UVec4 * result,const std::vector<tcu::UVec4> & ref,uint32_t invocationStride,uint32_t subgroupSize,uint32_t shaderMaxLoc)5298 qpTestResult_e ReconvergenceTestComputeInstance::calculateAndLogResult(const tcu::UVec4 *result,
5299                                                                        const std::vector<tcu::UVec4> &ref,
5300                                                                        uint32_t invocationStride, uint32_t subgroupSize,
5301                                                                        uint32_t shaderMaxLoc)
5302 {
5303     const uint32_t maxLoc = static_cast<uint32_t>(ref.size());
5304     tcu::TestLog &log     = m_context.getTestContext().getLog();
5305     qpTestResult res      = QP_TEST_RESULT_PASS;
5306     DE_ASSERT(subgroupSize * shaderMaxLoc <= maxLoc);
5307     DE_UNREF(shaderMaxLoc);
5308 
5309     uint32_t mismatchCount            = 0u;
5310     const uint32_t printMismatchCount = 5u;
5311     if (m_data.testType == TT_MAXIMAL)
5312     {
5313         // With maximal reconvergence, we should expect the output to exactly match
5314         // the reference.
5315         for (uint32_t i = 0; i < maxLoc; ++i)
5316         {
5317             const Ballot resultVal(result[i], subgroupSize);
5318             const Ballot refVal(ref[i], subgroupSize);
5319             if (resultVal != refVal)
5320             {
5321                 res = QP_TEST_RESULT_FAIL;
5322                 if (mismatchCount++ < printMismatchCount)
5323                 {
5324                     log << tcu::TestLog::Message << "Mismatch at " << i << "\nexpected: " << resultVal
5325                         << "\n     got: " << refVal << tcu::TestLog::EndMessage;
5326                 }
5327                 else
5328                     break;
5329             }
5330         }
5331 
5332 #if 0 // This log can be large and slow, ifdef it out by default
5333         log << tcu::TestLog::Message << "subgroupSize:" << subgroupSize << ", invocationStride:" << invocationStride << ", maxLoc:" << shaderMaxLoc << tcu::TestLog::EndMessage;
5334         uint32_t invMax = std::min(invocationStride, 50u);
5335         for (uint32_t inv = 0; inv < invMax; ++inv)
5336         {
5337             auto ll = log << tcu::TestLog::Message;
5338             ll << inv << ": ";
5339             for (uint32_t loc = 0; loc < shaderMaxLoc; ++loc)
5340             {
5341                 uint64_t entry = result[loc * invocationStride + inv];
5342                 ll << de::toString(loc) << ":" << tcu::toHex(entry) << ' ';
5343             }
5344             ll << tcu::TestLog::EndMessage;
5345         }
5346 #endif
5347 
5348         if (res != QP_TEST_RESULT_PASS)
5349         {
5350             for (uint32_t i = 0; i < maxLoc; ++i)
5351             {
5352 #if 0
5353                 // This log can be large and slow, ifdef it out by default
5354                 const Ballot resultVal(result[i], subgroupSize);
5355                 const Ballot refVal(ref[i], subgroupSize);
5356                 log << tcu::TestLog::Message << "result " << i << "(" << (i / invocationStride) << ", " << (i % invocationStride) << "): " << resultVal << " ref " << refVal << (resultVal != refVal ? " different" : "") << tcu::TestLog::EndMessage;
5357 #endif
5358             }
5359         }
5360     }
5361     else
5362     {
5363         DE_ASSERT(subgroupSize != 0);
5364 
5365         Ballot fullMask = subgroupSizeToMask(subgroupSize, 0 /* ignored */);
5366         // For subgroup_uniform_control_flow, we expect any fully converged outputs in the reference
5367         // to have a corresponding fully converged output in the result. So walk through each lane's
5368         // results, and for each reference value of fullMask, find a corresponding result value of
5369         // fullMask where the previous value (OP_STORE) matches. That means these came from the same
5370         // source location.
5371         vector<uint32_t> firstFail(invocationStride, 0);
5372         for (uint32_t lane = 0; lane < invocationStride; ++lane)
5373         {
5374             uint32_t resLoc = lane + invocationStride, refLoc = lane + invocationStride;
5375             while (refLoc < maxLoc)
5376             {
5377                 while (refLoc < maxLoc && ref[refLoc] != fullMask)
5378                     refLoc += invocationStride;
5379                 if (refLoc >= maxLoc)
5380                     break;
5381 
5382                 // For TT_SUCF_ELECT, when the reference result has a full mask, we expect lane 0 to be elected
5383                 // (a value of 2) and all other lanes to be not elected (a value of 1). For TT_SUCF_BALLOT, we
5384                 // expect a full mask. Search until we find the expected result with a matching store value in
5385                 // the previous result.
5386                 Ballot expectedResult = m_data.isElect() ? Ballot((lane % m_subgroupSize) == 0 ? 2 : 1) : fullMask;
5387 
5388                 while (resLoc < maxLoc && !(result[resLoc] == expectedResult &&
5389                                             result[resLoc - invocationStride] == ref[refLoc - invocationStride]))
5390                     resLoc += invocationStride;
5391 
5392                 // If we didn't find this output in the result, flag it as an error.
5393                 if (resLoc >= maxLoc)
5394                 {
5395                     firstFail[lane] = refLoc;
5396                     log << tcu::TestLog::Message << "lane " << lane << " first mismatch at " << firstFail[lane]
5397                         << tcu::TestLog::EndMessage;
5398                     res = QP_TEST_RESULT_FAIL;
5399                     break;
5400                 }
5401                 refLoc += invocationStride;
5402                 resLoc += invocationStride;
5403             }
5404         }
5405 
5406         if (res != QP_TEST_RESULT_PASS)
5407         {
5408             for (uint32_t i = 0; i < maxLoc; ++i)
5409             {
5410                 // This log can be large and slow, ifdef it out by default
5411 #if 0
5412                 log << tcu::TestLog::Message << "result " << i << "(" << (i / invocationStride) << ", " << (i % invocationStride) << "): " << tcu::toHex(result[i]) << " ref " << tcu::toHex(ref[i]) << (i == firstFail[i % invocationStride] ? " first fail" : "") << tcu::TestLog::EndMessage;
5413 #endif
5414             }
5415         }
5416     }
5417 
5418     return res;
5419 }
5420 
makeRenderPassBeginInfo(const VkRenderPass renderPass,const VkFramebuffer framebuffer)5421 VkRenderPassBeginInfo ReconvergenceTestGraphicsInstance::makeRenderPassBeginInfo(const VkRenderPass renderPass,
5422                                                                                  const VkFramebuffer framebuffer)
5423 {
5424     static const VkClearValue clearValue{{{0u, 0u, 0u, 0u}}};
5425     return {
5426         VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, // VkStructureType sType;
5427         nullptr,                                  // const void* pNext;
5428         renderPass,                               // VkRenderPass renderPass;
5429         framebuffer,                              // VkFramebuffer framebuffer;
5430         makeRect2D(m_data.sizeX, m_data.sizeY),   // VkRect2D renderArea;
5431         1u,                                       // uint32_t clearValueCount;
5432         &clearValue                               // const VkClearValue* pClearValues;
5433     };
5434 }
5435 
createVertexBufferAndFlush(uint32_t cellsHorz,uint32_t cellsVert,VkPrimitiveTopology topology)5436 de::MovePtr<BufferWithMemory> ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(
5437     uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
5438 {
5439     uint32_t vertexCount   = cellsHorz * cellsVert;
5440     uint32_t triangleCount = cellsHorz * cellsVert;
5441     switch (topology)
5442     {
5443     case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
5444         vertexCount = triangleCount * 3;
5445         break;
5446     case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
5447         vertexCount = triangleCount - 1 + 3;
5448         break;
5449     case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
5450     case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
5451         triangleCount = vertexCount - 3 + 1;
5452         break;
5453     default:
5454         DE_ASSERT(0);
5455     }
5456 
5457     const DeviceInterface &vk            = m_context.getDeviceInterface();
5458     const VkDevice device                = m_context.getDevice();
5459     Allocator &allocator                 = m_context.getDefaultAllocator();
5460     const VkDeviceSize bufferSize        = VkDeviceSize(vertexCount) * sizeof(Vertex);
5461     const VkBufferUsageFlags bufferUsage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
5462     const VkBufferCreateInfo createInfo  = makeBufferCreateInfo(bufferSize, bufferUsage);
5463     const MemoryRequirement memoryReqs   = (MemoryRequirement::HostVisible | MemoryRequirement::Coherent);
5464     de::MovePtr<BufferWithMemory> buffer(new BufferWithMemory(vk, device, allocator, createInfo, memoryReqs));
5465     Allocation &allocation = buffer->getAllocation();
5466     Vertex *vertices       = static_cast<Vertex *>(allocation.getHostPtr());
5467 
5468     if (VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST == topology)
5469     {
5470         const float stepX = 2.0f / float(cellsHorz);
5471         const float stepY = 2.0f / float(cellsVert);
5472 
5473         uint32_t t = 0;
5474         float y    = -1.0f;
5475         for (uint32_t h = 0; h < cellsVert; ++h)
5476         {
5477             float x        = -1.0f;
5478             const float yy = y + stepY;
5479             for (uint32_t w = 0; w < cellsHorz; ++w)
5480             {
5481                 const float xx = x + stepX;
5482 
5483                 vertices[t++] = {x, yy, 0.f, 0.f};
5484                 vertices[t++] = {((xx + x) / 2.f), y, 0.f, 0.f};
5485                 vertices[t++] = {xx, ((yy + y) / 2.f), 0.f, 0.f};
5486 
5487                 x = xx;
5488             }
5489             y = yy;
5490         }
5491         DE_ASSERT(vertexCount == t);
5492     }
5493     else
5494     {
5495         const uint32_t div = static_cast<uint32_t>(ROUNDUP(triangleCount, 2) / 2);
5496         const float step   = 2.0f / static_cast<float>(div);
5497 
5498         uint32_t t = 0;
5499         float x    = -1.0f;
5500         for (uint32_t i = 0; i < div; ++i)
5501         {
5502             const bool last   = ((div - i) == 1u);
5503             const float xNext = last ? +1.0f : (x + step);
5504 
5505             const Vertex v0{x, +1.0f, 0.0f, 0.0f};
5506             const Vertex v1{xNext, +1.0f, 0.0f, 0.0f};
5507             const Vertex v2{xNext, -1.0f, 0.0f, 0.0f};
5508             const Vertex v3{x, -1.0f, 0.0f, 0.0f};
5509 
5510             if (t == 0)
5511             {
5512                 vertices[0] = v0;
5513                 vertices[1] = v3;
5514                 vertices[2] = v1;
5515 
5516                 t = 3;
5517             }
5518             else
5519             {
5520                 vertices[t++] = v1;
5521             }
5522 
5523             if (!last || !(triangleCount % 2))
5524             {
5525                 vertices[t++] = v2;
5526             }
5527 
5528             x += step;
5529         }
5530         DE_ASSERT(vertexCount == t);
5531     }
5532 
5533     flushAlloc(vk, device, allocation);
5534     return buffer;
5535 }
generateVertices(const uint32_t primitiveCount,const VkPrimitiveTopology topology,const uint32_t patchSize)5536 std::vector<tcu::Vec4> ReconvergenceTestGraphicsInstance::generateVertices(const uint32_t primitiveCount,
5537                                                                            const VkPrimitiveTopology topology,
5538                                                                            const uint32_t patchSize)
5539 {
5540     auto cast     = [](const float f) -> float { return ((f * 2.0f) - 1.0f); };
5541     auto bestRect = [](const uint32_t c) -> std::pair<uint32_t, uint32_t>
5542     {
5543         uint32_t a = 1;
5544         uint32_t b = 1;
5545         do
5546         {
5547             a = a + 1;
5548             b = (c / a) + ((c % a) ? 1 : 0);
5549         } while (a < b);
5550         return {a, b};
5551     };
5552 
5553     uint32_t triangleCount = 0;
5554     uint32_t vertexCount   = 0;
5555     switch (topology)
5556     {
5557     case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
5558         triangleCount = primitiveCount;
5559         vertexCount   = triangleCount + 3 - 1;
5560         break;
5561     case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
5562         triangleCount = primitiveCount;
5563         vertexCount   = triangleCount * 3;
5564         break;
5565     case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
5566         vertexCount = primitiveCount;
5567         break;
5568     case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
5569         vertexCount   = primitiveCount * patchSize;
5570         triangleCount = ROUNDUP(vertexCount, 3) / 3;
5571         break;
5572     default:
5573         DE_ASSERT(false);
5574     }
5575 
5576     if (3 == vertexCount)
5577     {
5578         return {{-1.0f, +1.0f, 0.0f, 1.0f}, {0.0f, -1.0f, 0.0f, 1.0f}, {+1.0f, +1.0f, 0.0f, 1.0f}};
5579     }
5580 
5581     std::vector<tcu::Vec4> vertices(vertexCount);
5582 
5583     if (VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP == topology)
5584     {
5585         uint32_t v         = 0;
5586         const uint32_t div = ROUNDUP(triangleCount, 2) / 2;
5587 
5588         for (uint32_t i = 0; i < triangleCount && v < vertexCount; ++i)
5589         {
5590             const float xx = cast(float((i / 2) + 1) / float(div));
5591             if (0 == i)
5592             {
5593                 const float x = cast(float(i / 2) / float(div));
5594                 vertices[v++] = {x, +1.0f, 0.0f, 1.0f};
5595                 vertices[v++] = {x, -1.0f, 0.0f, 1.0f};
5596                 vertices[v++] = {xx, +1.0f, 0.0f, 1.0f};
5597             }
5598             else
5599             {
5600                 if (i % 2)
5601                     vertices[v++] = {xx, -1.0f, 0.0f, 1.0f};
5602                 else
5603                     vertices[v++] = {xx, +1.0f, 0.0f, 1.0f};
5604             }
5605         }
5606         DE_ASSERT(vertexCount == v);
5607     }
5608     else if (VK_PRIMITIVE_TOPOLOGY_POINT_LIST == topology)
5609     {
5610         uint32_t v      = 0;
5611         const auto rect = bestRect(vertexCount);
5612 
5613         float y = -1.0f;
5614         for (uint32_t h = 0; h < rect.second; ++h)
5615         {
5616             const float yy = cast(float(h + 1) / float(rect.second));
5617             float x        = -1.0f;
5618             for (uint32_t w = 0; w < rect.first && v < vertexCount; ++w)
5619             {
5620                 const float xx = cast(float(w + 1) / float(rect.first));
5621                 vertices[v++]  = {((xx - x) / 2.0f), ((yy - y) / 2.0f), 0.0f, 1.0f};
5622                 x              = xx;
5623             }
5624             y = yy;
5625         }
5626         DE_ASSERT(vertexCount == v);
5627     }
5628     else if (VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST == topology || VK_PRIMITIVE_TOPOLOGY_PATCH_LIST == topology)
5629     {
5630         uint32_t v      = 0;
5631         const auto rect = bestRect(triangleCount);
5632 
5633         float y = -1.0f;
5634         for (uint32_t h = 0; h < rect.second && v < vertexCount; ++h)
5635         {
5636             const float yy = cast(float(h + 1) / float(rect.second));
5637             float x        = -1.0f;
5638             for (uint32_t w = 0; w < rect.first && v < vertexCount; ++w)
5639             {
5640                 const float xx = cast(float(w + 1) / float(rect.first));
5641                 if (v < vertexCount)
5642                     vertices[v++] = {x, yy, 0.f, 0.f};
5643                 if (v < vertexCount)
5644                     vertices[v++] = {((xx + x) / 2.f), y, 0.f, 0.f};
5645                 if (v < vertexCount)
5646                     vertices[v++] = {xx, ((yy + y) / 2.f), 0.f, 0.f};
5647                 x = xx;
5648             }
5649             y = yy;
5650         }
5651         DE_ASSERT(vertexCount == v);
5652     }
5653 
5654     return vertices;
5655 }
5656 
createVertexBufferAndFlush(const std::vector<tcu::Vec4> & vertices)5657 de::MovePtr<BufferWithMemory> ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(
5658     const std::vector<tcu::Vec4> &vertices)
5659 {
5660     const DeviceInterface &vk            = m_context.getDeviceInterface();
5661     const VkDevice device                = m_context.getDevice();
5662     Allocator &allocator                 = m_context.getDefaultAllocator();
5663     const VkDeviceSize bufferSize        = VkDeviceSize(vertices.size()) * sizeof(tcu::Vec4);
5664     const VkBufferUsageFlags bufferUsage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
5665     const VkBufferCreateInfo createInfo  = makeBufferCreateInfo(bufferSize, bufferUsage);
5666     const MemoryRequirement memoryReqs   = (MemoryRequirement::HostVisible | MemoryRequirement::Coherent);
5667     de::MovePtr<BufferWithMemory> buffer(new BufferWithMemory(vk, device, allocator, createInfo, memoryReqs));
5668     Allocation &allocation = buffer->getAllocation();
5669     auto bufferRange       = makeStdBeginEnd<tcu::Vec4>(allocation.getHostPtr(), (uint32_t)vertices.size());
5670     std::copy(vertices.begin(), vertices.end(), bufferRange.first);
5671     flushAlloc(vk, device, allocation);
5672     return buffer;
5673 }
5674 
recordDrawingAndSubmit(const VkCommandBuffer cmdBuffer,const VkPipelineLayout pipelineLayout,const VkPipeline pipeline,const VkDescriptorSet descriptorSet,const PushConstant & pushConstant,const VkRenderPassBeginInfo & renderPassInfo,const VkBuffer vertexBuffer,const uint32_t vertexCount,const VkImage image)5675 void ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit(
5676     const VkCommandBuffer cmdBuffer, const VkPipelineLayout pipelineLayout, const VkPipeline pipeline,
5677     const VkDescriptorSet descriptorSet, const PushConstant &pushConstant, const VkRenderPassBeginInfo &renderPassInfo,
5678     const VkBuffer vertexBuffer, const uint32_t vertexCount, const VkImage image)
5679 {
5680     DE_UNREF(image);
5681     const DeviceInterface &vk           = m_context.getDeviceInterface();
5682     const VkDevice device               = m_context.getDevice();
5683     const VkQueue queue                 = m_context.getUniversalQueue();
5684     const VkPipelineBindPoint bindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS;
5685 
5686     beginCommandBuffer(vk, cmdBuffer, 0u);
5687     vk.cmdBindDescriptorSets(cmdBuffer, bindPoint, pipelineLayout, 0u, 1u, &descriptorSet, 0u, DE_NULL);
5688     vk.cmdBindPipeline(cmdBuffer, bindPoint, pipeline);
5689     vk.cmdBindVertexBuffers(cmdBuffer, 0u, 1u, &static_cast<const VkBuffer &>(vertexBuffer),
5690                             &static_cast<const VkDeviceSize &>(0u));
5691     vk.cmdPushConstants(cmdBuffer, pipelineLayout, m_data.shaderStage, 0, sizeof(PushConstant), &pushConstant);
5692     vk.cmdBeginRenderPass(cmdBuffer, &renderPassInfo, VK_SUBPASS_CONTENTS_INLINE);
5693     vk.cmdDraw(cmdBuffer, vertexCount, 1u, 0u, 0u);
5694     vk.cmdEndRenderPass(cmdBuffer);
5695     endCommandBuffer(vk, cmdBuffer);
5696 
5697     submitCommandsAndWait(vk, device, queue, cmdBuffer);
5698 }
5699 
createShaders(void)5700 std::vector<Move<VkShaderModule>> ReconvergenceTestFragmentInstance::createShaders(void)
5701 {
5702     const DeviceInterface &vk = m_context.getDeviceInterface();
5703     const VkDevice device     = m_context.getDevice();
5704 
5705     Move<VkShaderModule> vertex   = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"), 0);
5706     Move<VkShaderModule> fragment = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"), 0);
5707 
5708     // { #vert, #frag, tesc, tese, geom }; if any
5709     std::vector<Move<VkShaderModule>> shaders;
5710     shaders.emplace_back(vertex);
5711     shaders.emplace_back(fragment);
5712 
5713     return shaders;
5714 }
5715 
calculateAndLogResult(const uint64_t * result,const std::vector<uint64_t> & ref,uint32_t invocationStride,uint32_t subgroupSize,uint32_t shaderMaxLocs,uint32_t primitiveCount,PrintMode printMode)5716 qpTestResult_e ReconvergenceTestGraphicsInstance::calculateAndLogResult(const uint64_t *result,
5717                                                                         const std::vector<uint64_t> &ref,
5718                                                                         uint32_t invocationStride,
5719                                                                         uint32_t subgroupSize, uint32_t shaderMaxLocs,
5720                                                                         uint32_t primitiveCount, PrintMode printMode)
5721 {
5722     DE_ASSERT(m_data.testType == TT_MAXIMAL);
5723 
5724     const uint32_t maxLoc  = static_cast<uint32_t>(ref.size());
5725     tcu::TestLog &log      = m_context.getTestContext().getLog();
5726     qpTestResult res       = QP_TEST_RESULT_PASS;
5727     uint32_t mismatchCount = 0;
5728 
5729     DE_ASSERT(shaderMaxLocs * invocationStride <= maxLoc);
5730 
5731     // With maximal reconvergence, we should expect the output to exactly match
5732     // the reference.
5733     for (uint32_t i = 0; i < maxLoc; ++i)
5734     {
5735         const uint64_t resultVal = result[i];
5736         const uint64_t refVal    = ref[i];
5737         if (resultVal != refVal)
5738         {
5739             if (1 > mismatchCount++)
5740             {
5741                 log << tcu::TestLog::Message << mismatchCount << ": Mismatch at " << i
5742                     << ", res: " << tcu::toHex(resultVal) << ", ref: " << tcu::toHex(refVal)
5743                     << tcu::TestLog::EndMessage;
5744             }
5745         }
5746     }
5747 
5748     if (PrintMode::None != printMode)
5749     {
5750         log << tcu::TestLog::Message << "deviceSubgroupSize: " << m_subgroupSize
5751             << ", testSubgroupSize: " << subgroupSize << ", invocationStride: " << invocationStride
5752             << ", shaderMaxLocs: " << shaderMaxLocs << "\n\t, framebuffer: " << m_data.sizeX << 'x' << m_data.sizeY
5753             << ", primitiveCount: " << primitiveCount << ", PRINT_MODE: "
5754             << ((PrintMode::ThreadsInColumns == printMode) ?
5755                     "\"ouLocs in rows & threads in columns\"" :
5756                     ((PrintMode::OutLocsInColumns == printMode) ? "\"threads in rows & outLocs in columns\"" : ""))
5757             << " { id:res,ref }\n"
5758             << tcu::TestLog::EndMessage;
5759     }
5760 
5761     uint32_t invMax = std::min(invocationStride, 80u);
5762 
5763     if (PrintMode::ThreadsInColumns == printMode)
5764     {
5765         for (uint32_t loc = 0; loc < shaderMaxLocs; ++loc)
5766         {
5767             auto l1 = log << tcu::TestLog::Message;
5768             l1 << "loc " << std::setw(3) << loc << ": ";
5769             for (uint32_t inv = 0; inv < invMax; ++inv)
5770             {
5771                 uint32_t idx = loc * invocationStride + inv;
5772                 DE_ASSERT(idx < maxLoc);
5773                 uint64_t resEntry = result[idx];
5774                 uint64_t refEntry = ref[idx];
5775                 //l1 << de::toString(inv) << ':' << tcu::toHex(resEntry) << ',' << tcu::toHex(refEntry) << ' ';
5776                 l1 << std::dec << inv << ':' << std::setw(subgroupSize / 4) << std::hex << resEntry << ','
5777                    << std::setw(subgroupSize / 4) << std::hex << refEntry << std::dec << ' ';
5778             }
5779             l1 << std::setw(0) << tcu::TestLog::EndMessage;
5780         }
5781     }
5782     else if (PrintMode::OutLocsInColumns == printMode)
5783     {
5784         for (uint32_t inv = 0; inv < invMax; ++inv)
5785         {
5786             auto l1 = log << tcu::TestLog::Message;
5787             l1 << "res " << std::setw(3) << inv << ": ";
5788             for (uint32_t loc = 0; loc < shaderMaxLocs; ++loc)
5789             {
5790                 uint32_t idx = loc * invocationStride + inv;
5791                 DE_ASSERT(idx < maxLoc);
5792                 uint64_t entry = result[idx];
5793                 l1 << de::toString(loc) << ':' << tcu::toHex(entry) << ' ';
5794             }
5795             l1 << std::setw(0) << tcu::TestLog::EndMessage;
5796 
5797             auto l2 = log << tcu::TestLog::Message;
5798             l2 << "ref " << std::setw(3) << inv << ": ";
5799             for (uint32_t loc = 0; loc < shaderMaxLocs; ++loc)
5800             {
5801                 uint32_t idx = loc * invocationStride + inv;
5802                 DE_ASSERT(idx < maxLoc);
5803                 uint64_t entry = ref[idx];
5804                 l2 << de::toString(loc) << ':' << tcu::toHex(entry) << ' ';
5805             }
5806             l2 << std::setw(0) << tcu::TestLog::EndMessage;
5807         }
5808     }
5809 
5810     if (mismatchCount)
5811     {
5812         double mismatchPercentage = 0.0;
5813         std::modf((double)(mismatchCount * 100) / (double)maxLoc, &mismatchPercentage);
5814         log << tcu::TestLog::Message << "Mismatch count " << mismatchCount << " from " << maxLoc << " ("
5815             << mismatchPercentage << "%)" << tcu::TestLog::EndMessage;
5816         res = QP_TEST_RESULT_FAIL;
5817     }
5818 
5819     if (res != QP_TEST_RESULT_PASS)
5820     {
5821         for (uint32_t i = 0; i < maxLoc; ++i)
5822         {
5823             // This log can be large and slow, ifdef it out by default
5824 #if 0
5825             log << tcu::TestLog::Message << "result " << i << "(" << (i / invocationStride) << ", " << (i % invocationStride) << "): " << tcu::toHex(result[i]) << " ref " << tcu::toHex(ref[i]) << (result[i] != ref[i] ? " different" : "") << tcu::TestLog::EndMessage;
5826 #endif
5827         }
5828     }
5829 
5830     return res;
5831 }
5832 
calculateAndLogResultEx(tcu::TestLog & log,const tcu::UVec4 * result,const std::vector<tcu::UVec4> & ref,const uint32_t maxLoc,const Arrangement & a,const PrintMode printMode)5833 qpTestResult_e ReconvergenceTestFragmentInstance::calculateAndLogResultEx(tcu::TestLog &log, const tcu::UVec4 *result,
5834                                                                           const std::vector<tcu::UVec4> &ref,
5835                                                                           const uint32_t maxLoc, const Arrangement &a,
5836                                                                           const PrintMode printMode)
5837 {
5838     DE_UNREF(printMode);
5839 
5840     qpTestResult res                             = QP_TEST_RESULT_PASS;
5841     uint32_t mismatchCount                       = 0u;
5842     const uint32_t printMismatchCount            = 5u;
5843     const FragmentRandomProgram::Arrangement &aa = static_cast<const FragmentRandomProgram::Arrangement &>(a);
5844 
5845     // With maximal reconvergence, we should expect the output to exactly match
5846     // the reference.
5847     const uint32_t ballotStoreCount = maxLoc * aa.m_invocationStride * aa.m_primitiveStride;
5848     for (uint32_t i = 0; i < ballotStoreCount; ++i)
5849     {
5850         const Ballot resultVal(result[i], aa.m_subgroupSize);
5851         ;
5852         const Ballot refVal(ref[i], aa.m_subgroupSize);
5853         if (resultVal != refVal)
5854         {
5855             if (mismatchCount++ < printMismatchCount)
5856             {
5857                 res = QP_TEST_RESULT_FAIL;
5858                 log << tcu::TestLog::Message << "Mismatch at " << i << "\nexpected: " << resultVal
5859                     << "\n     got: " << refVal << tcu::TestLog::EndMessage;
5860                 if (printMode == PrintMode::Console)
5861                 {
5862                     std::cout << "Mismatch at " << i << "\nexpected: " << resultVal << "\n     got: " << refVal
5863                               << std::endl;
5864                 }
5865             }
5866         }
5867     }
5868 
5869     log << tcu::TestLog::Message << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount
5870         << tcu::TestLog::EndMessage;
5871     if (printMode == PrintMode::Console)
5872     {
5873         std::cout << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount << std::endl;
5874     }
5875 
5876     return res;
5877 }
5878 
makeImageCreateInfo(VkFormat format) const5879 VkImageCreateInfo ReconvergenceTestFragmentInstance::makeImageCreateInfo(VkFormat format) const
5880 {
5881     return {
5882         VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
5883         nullptr,                             // const void* pNext;
5884         VkImageCreateFlags(0),               // VkImageCreateFlags flags;
5885         VK_IMAGE_TYPE_2D,                    // VkImageType imageType;
5886         format,                              // VkFormat format;
5887         {m_data.sizeX, m_data.sizeY, 1u},    // VkExtent3D extent;
5888         1u,                                  // uint32_t mipLevels;
5889         1u,                                  // uint32_t arrayLayers;
5890         VK_SAMPLE_COUNT_1_BIT,               // VkSampleCountFlagBits samples;
5891         VK_IMAGE_TILING_OPTIMAL,             // VkImageTiling tiling;
5892         VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
5893         VK_SHARING_MODE_EXCLUSIVE,           // VkSharingMode sharingMode;
5894         0u,                                  // uint32_t queueFamilyIndexCount;
5895         0u,                                  // const uint32_t* pQueueFamilyIndices;
5896         VK_IMAGE_LAYOUT_UNDEFINED            // VkImageLayout initialLayout;
5897     };
5898 }
5899 
createVertexBufferAndFlush(uint32_t cellsHorz,uint32_t cellsVert,VkPrimitiveTopology topology)5900 de::MovePtr<BufferWithMemory> ReconvergenceTestFragmentInstance::createVertexBufferAndFlush(
5901     uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
5902 {
5903     // DE_ASSERT(cellsHorz == 2u);
5904     // DE_ASSERT((cellsHorz * 3) == cellsVert);
5905     DE_UNREF(cellsHorz);
5906     DE_UNREF(cellsVert);
5907     DE_UNREF(topology);
5908     DE_ASSERT(topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST);
5909     const std::vector<tcu::Vec4> vertices{{-1.0f, 0.0f, 0.0f, 0.0f},  {-0.5f, -1.0f, 0.0f, 0.0f},
5910                                           {+1.0f, +1.0f, 0.0f, 0.0f}, {+0.5f, -1.0f, 0.0f, 0.0f},
5911                                           {+1.0f, 0.0f, 0.0f, 0.0f},  {-1.0f, +1.0f, 0.0f, 0.0f}};
5912     return ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(vertices);
5913 }
5914 
callAuxiliaryShader(tcu::TestStatus & status,uint32_t triangleCount)5915 std::vector<uint32_t> ReconvergenceTestFragmentInstance::callAuxiliaryShader(tcu::TestStatus &status,
5916                                                                              uint32_t triangleCount)
5917 {
5918     const DeviceInterface &vk    = m_context.getDeviceInterface();
5919     const VkDevice device        = m_context.getDevice();
5920     add_ref<Allocator> allocator = m_context.getDefaultAllocator();
5921     const uint32_t queueIndex    = m_context.getUniversalQueueFamilyIndex();
5922     //add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
5923     const uint32_t bufferElems    = m_data.sizeX * m_data.sizeY * triangleCount + 3u;
5924     const VkDeviceSize bufferSize = bufferElems * sizeof(uint32_t);
5925 
5926     if (bufferSize > m_context.getDeviceProperties().limits.maxStorageBufferRange)
5927         TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
5928 
5929     const VkBufferCreateInfo createInfo =
5930         vk::makeBufferCreateInfo(bufferSize, (VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
5931                                               VK_BUFFER_USAGE_TRANSFER_SRC_BIT));
5932     de::MovePtr<BufferWithMemory> buffer;
5933     try
5934     {
5935         buffer = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
5936             vk, device, allocator, createInfo, (MemoryRequirement::HostVisible | MemoryRequirement::Coherent)));
5937     }
5938     catch (tcu::ResourceError &)
5939     {
5940         // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
5941         status = tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
5942                                  "Failed device memory allocation " + de::toString(bufferSize) + " bytes");
5943         return {};
5944     }
5945 
5946     const VkDescriptorBufferInfo bufferInfo = makeDescriptorBufferInfo(**buffer, 0, bufferSize);
5947 
5948     vk::DescriptorSetLayoutBuilder layoutBuilder;
5949     layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_FRAGMENT_BIT);
5950     vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
5951 
5952     vk::DescriptorPoolBuilder poolBuilder;
5953     poolBuilder.addType(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1u);
5954     vk::Unique<vk::VkDescriptorPool> descriptorPool(
5955         poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
5956 
5957     vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
5958 
5959     vk::DescriptorSetUpdateBuilder setUpdateBuilder;
5960     setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(0),
5961                                  VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferInfo);
5962     setUpdateBuilder.update(vk, device);
5963 
5964     const VkPushConstantRange pushConstantRange{
5965         VK_SHADER_STAGE_FRAGMENT_BIT, // VkShaderStageFlags stageFlags;
5966         0u,                           // uint32_t offset;
5967         sizeof(PushConstant)          // uint32_t size;
5968     };
5969     const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
5970         VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
5971         DE_NULL,                                       // pNext
5972         (VkPipelineLayoutCreateFlags)0,                // flags
5973         1u,                                            // setLayoutCount
5974         &descriptorSetLayout.get(),                    // pSetLayouts
5975         1u,                                            // pushConstantRangeCount
5976         &pushConstantRange,                            // pPushConstantRanges
5977     };
5978 
5979     const VkFormat format                   = VK_FORMAT_R8G8B8A8_UNORM;
5980     const VkImageCreateInfo imageCreateInfo = makeImageCreateInfo(format);
5981     const VkImageSubresourceRange rscRange  = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
5982     de::MovePtr<ImageWithMemory> image(
5983         new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
5984     Move<VkImageView> view        = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
5985     Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
5986     Move<VkFramebuffer> framebuffer =
5987         makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
5988     const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
5989     auto createAuxShaders                       = [&]()
5990     {
5991         Shaders shaders;
5992         auto vert = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"), 0);
5993         auto frag = createShaderModule(vk, device, m_context.getBinaryCollection().get("aux"), 0);
5994         shaders.emplace_back(vert);
5995         shaders.emplace_back(frag);
5996         return shaders;
5997     };
5998     const Shaders shaders      = createAuxShaders();
5999     const uint32_t vertexCount = triangleCount * 3u;
6000     de::MovePtr<BufferWithMemory> vertexBuffer =
6001         createVertexBufferAndFlush(triangleCount, vertexCount, VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST);
6002     Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
6003     Move<VkPipeline> pipeline = createGraphicsPipeline(*pipelineLayout, *renderPass, m_data.sizeX, m_data.sizeY,
6004                                                        shaders, VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, 0U);
6005     Move<VkCommandPool> cmdPool =
6006         createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
6007     Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6008 
6009     PushConstant pc{};
6010     pc.invocationStride = 0u;
6011     pc.width            = m_data.sizeX;
6012     pc.height           = m_data.sizeY;
6013     pc.primitiveStride  = triangleCount;
6014 
6015     void *ptr        = buffer->getAllocation().getHostPtr();
6016     auto bufferRange = makeStdBeginEnd<uint32_t>(ptr, bufferElems);
6017     std::fill(bufferRange.first, bufferRange.second, 0u);
6018 
6019     std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this, *cmdBuffer, *pipelineLayout, *pipeline,
6020               *descriptorSet, std::cref(pc), std::cref(renderBeginInfo), **vertexBuffer, vertexCount, **image)();
6021 
6022     status = tcu::TestStatus::pass(std::string());
6023     return std::vector<uint32_t>(bufferRange.first, bufferRange.second);
6024 }
6025 
iterate(void)6026 tcu::TestStatus ReconvergenceTestFragmentInstance::iterate(void)
6027 {
6028     const DeviceInterface &vk            = m_context.getDeviceInterface();
6029     const VkDevice device                = m_context.getDevice();
6030     add_ref<Allocator> allocator         = m_context.getDefaultAllocator();
6031     const uint32_t queueIndex            = m_context.getUniversalQueueFamilyIndex();
6032     add_ref<tcu::TestLog> log            = m_context.getTestContext().getLog();
6033     const VkPhysicalDeviceLimits &limits = m_context.getDeviceProperties().limits;
6034     const uint32_t fragmentStride        = m_data.sizeX * m_data.sizeY;
6035     const uint32_t primitiveStride       = 2;
6036 
6037     if (sizeof(PushConstant) > limits.maxPushConstantsSize)
6038     {
6039         return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6040                                "PushConstant size " + std::to_string(sizeof(PushConstant)) + " exceeds device limit " +
6041                                    std::to_string(limits.maxPushConstantsSize));
6042     }
6043 
6044     tcu::TestStatus auxStatus(QP_TEST_RESULT_FAIL, std::string());
6045     std::vector<uint32_t> primitiveMap = callAuxiliaryShader(auxStatus, primitiveStride);
6046     if (auxStatus.isFail())
6047         return auxStatus;
6048 
6049     const uint32_t shaderSubgroupSize = primitiveMap.at(fragmentStride * primitiveStride + 1u);
6050     if (shaderSubgroupSize != m_subgroupSize)
6051     {
6052         return tcu::TestStatus(QP_TEST_RESULT_FAIL,
6053                                "The size of the subgroup from the shader (" + std::to_string(shaderSubgroupSize) +
6054                                    ") is different from the size of the subgroup from the device (" +
6055                                    std::to_string(m_subgroupSize) + ")");
6056     }
6057     const uint32_t shaderSubgroupStride = primitiveMap.at(fragmentStride * primitiveStride + 0u);
6058     const uint32_t hostSubgroupStride =
6059         FragmentRandomProgram::Arrangement::calcSubgroupCount(primitiveMap, primitiveStride, fragmentStride);
6060     if (shaderSubgroupStride != hostSubgroupStride)
6061     {
6062         return tcu::TestStatus(QP_TEST_RESULT_FAIL,
6063                                "The number of subgroups from the shader (" + std::to_string(shaderSubgroupStride) +
6064                                    ") is different from the number of subgroups calculated manually (" +
6065                                    std::to_string(hostSubgroupStride) + ")");
6066     }
6067 
6068     log << tcu::TestLog::Message << "Subgroup count: " << hostSubgroupStride << tcu::TestLog::EndMessage;
6069     log << tcu::TestLog::Message << "Subgroup size: " << m_subgroupSize << tcu::TestLog::EndMessage;
6070 
6071     const VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
6072     de::MovePtr<BufferWithMemory> vertexBuffer =
6073         createVertexBufferAndFlush(primitiveStride, (primitiveStride * 3u), topology);
6074 
6075     std::vector<tcu::UVec4> ref;
6076     de::MovePtr<FragmentRandomProgram> program = FragmentRandomProgram::create(m_data);
6077     program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
6078 
6079     const uint32_t simulationMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), true, m_subgroupSize,
6080                                                        fragmentStride, primitiveStride, ref, log, primitiveMap);
6081     log << tcu::TestLog::Message << "simulated maxLoc: " << simulationMaxLoc << tcu::TestLog::EndMessage;
6082     // maxLoc is per-invocation. Add one (to make sure no additional writes are done)
6083     uint32_t maxLoc = simulationMaxLoc;
6084     maxLoc += 1;
6085     maxLoc *= (hostSubgroupStride * 128u * primitiveStride);
6086 
6087     constexpr uint32_t bufferCount = 4;
6088     enum Bindings
6089     {
6090         InputA,
6091         OutputBallots,
6092         OutputCounts,
6093         OutputPriMap
6094     };
6095 
6096     de::MovePtr<BufferWithMemory> buffers[bufferCount];
6097     vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
6098 
6099     VkDeviceSize sizes[bufferCount]{
6100         // InputA  { uint    a[]; } inputA;  filled with a[i] := i
6101         (FragmentRandomProgram::conditionIfInvocationStride + 2) * sizeof(uint32_t),
6102 
6103         // OutputB { uvec4   b[]; } outputB;
6104         maxLoc * sizeof(tcu::UVec4),
6105 
6106         // OutputC { uint loc[]; } outputC;
6107         (hostSubgroupStride * 128u * primitiveStride) * sizeof(uint32_t),
6108 
6109         // OutputP { uvec   p[]; } outputP; few more for calculating subgroupID, subgroupSize, non-helper and helperinvocations
6110         (fragmentStride * primitiveStride + 16u) * sizeof(uint32_t)};
6111 
6112     VkBufferUsageFlags usages[bufferCount]{
6113         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6114         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6115         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6116         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6117     };
6118 
6119     // allocate buffers
6120     for (uint32_t i = 0; i < bufferCount; ++i)
6121     {
6122         if (sizes[i] > limits.maxStorageBufferRange)
6123             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6124 
6125         try
6126         {
6127             buffers[i] = de::MovePtr<BufferWithMemory>(
6128                 new BufferWithMemory(vk, device, allocator,
6129                                      makeBufferCreateInfo(sizes[i], usages[i] | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
6130                                                                         VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
6131                                      MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6132         }
6133         catch (tcu::ResourceError &)
6134         {
6135             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6136             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6137                                    "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
6138         }
6139         bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
6140     }
6141 
6142     // get raw pointers to previously allocated buffers
6143     void *ptrs[bufferCount];
6144     for (uint32_t i = 0; i < bufferCount; ++i)
6145     {
6146         ptrs[i] = buffers[i]->getAllocation().getHostPtr();
6147     }
6148 
6149     // populate buffers with their destination
6150     {
6151         auto rangeBufferA =
6152             makeStdBeginEnd<uint32_t>(ptrs[InputA], static_cast<uint32_t>(sizes[InputA] / sizeof(uint32_t)));
6153         std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
6154     }
6155     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6156     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6157     deMemset(ptrs[OutputPriMap], 0, (size_t)sizes[OutputPriMap]);
6158 
6159     // (...) and flush them to the GPU
6160     for (uint32_t i = 0; i < bufferCount; ++i)
6161     {
6162         flushAlloc(vk, device, buffers[i]->getAllocation());
6163     }
6164 
6165     VkDescriptorType descTypes[bufferCount]{
6166         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6167         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6168         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6169         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6170     };
6171 
6172     vk::DescriptorSetLayoutBuilder layoutBuilder;
6173     for (uint32_t i = 0; i < bufferCount; ++i)
6174     {
6175         layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
6176     }
6177     vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
6178 
6179     vk::DescriptorPoolBuilder poolBuilder;
6180     for (uint32_t i = 0; i < bufferCount; ++i)
6181     {
6182         poolBuilder.addType(descTypes[i], 1);
6183     }
6184     vk::Unique<vk::VkDescriptorPool> descriptorPool(
6185         poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
6186     vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
6187 
6188     vk::DescriptorSetUpdateBuilder setUpdateBuilder;
6189     for (uint32_t i = 0; i < bufferCount; ++i)
6190     {
6191         setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
6192                                      &bufferDescriptors[i]);
6193     }
6194     setUpdateBuilder.update(vk, device);
6195 
6196     const VkPushConstantRange pushConstantRange{
6197         (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
6198         0u,                                     // uint32_t offset;
6199         sizeof(PushConstant)                    // uint32_t size;
6200     };
6201 
6202     const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
6203         VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
6204         DE_NULL,                                       // pNext
6205         (VkPipelineLayoutCreateFlags)0,                // flags
6206         1u,                                            // setLayoutCount
6207         &descriptorSetLayout.get(),                    // pSetLayouts
6208         1u,                                            // pushConstantRangeCount
6209         &pushConstantRange,                            // pPushConstantRanges
6210     };
6211 
6212     const VkFormat format                   = VK_FORMAT_R8G8B8A8_UNORM;
6213     const VkImageCreateInfo imageCreateInfo = makeImageCreateInfo(format);
6214     const VkImageSubresourceRange rscRange  = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
6215     de::MovePtr<ImageWithMemory> image(
6216         new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
6217     Move<VkImageView> view        = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
6218     Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
6219     Move<VkFramebuffer> framebuffer =
6220         makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
6221     const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
6222     const Shaders shaders                       = createShaders();
6223     Move<VkPipelineLayout> pipelineLayout       = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
6224     Move<VkPipeline> pipeline =
6225         createGraphicsPipeline(*pipelineLayout, *renderPass, m_data.sizeX, m_data.sizeY, shaders, topology, 0U);
6226     Move<VkCommandPool> cmdPool =
6227         createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
6228     Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6229 
6230     PushConstant pc{};
6231     pc.width                 = m_data.sizeX;
6232     pc.height                = m_data.sizeY;
6233     pc.primitiveStride       = primitiveStride;
6234     pc.invocationStride      = 0u;
6235     pc.subgroupStride        = hostSubgroupStride;
6236     pc.enableInvocationIndex = VK_FALSE;
6237 
6238     auto callRecordDrawingAndSubmit = std::bind(
6239         &ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this, *cmdBuffer, *pipelineLayout, *pipeline,
6240         *descriptorSet, std::cref(pc), std::cref(renderBeginInfo), **vertexBuffer, (primitiveStride * 3u), **image);
6241 
6242     // compute "maxLoc", which is a potential maximum number of locations written
6243     callRecordDrawingAndSubmit();
6244 
6245     // Take the maximum of "maxLoc" over all invocations.
6246     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6247     auto rangeLoc = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], (hostSubgroupStride * 128u * primitiveStride));
6248     const uint32_t computedShaderMaxLoc = *max_element(rangeLoc.first, rangeLoc.second);
6249     log << tcu::TestLog::Message << "Computed maxLoc in the shader: " << computedShaderMaxLoc
6250         << tcu::TestLog::EndMessage;
6251 
6252     if (computedShaderMaxLoc >= FragmentRandomProgram::experimentalOutLocSize)
6253     {
6254         return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6255                                "Calculated maxLoc from a shader (which is " + de::toString(computedShaderMaxLoc) +
6256                                    ") "
6257                                    "exceeds BALLOT_STACK_SIZE (which is " +
6258                                    de::toString(FragmentRandomProgram::experimentalOutLocSize) +
6259                                    ").\n"
6260                                    "To repair this just increment slightly a " MAKETEXT(
6261                                        FragmentRandomProgram::experimentalOutLocSize) " "
6262                                                                                       "in line " +
6263                                    de::toString(BALLOT_STACK_SIZE_DEFVAL_LINE));
6264     }
6265 
6266     // If we need more space, reallocate OutputB::b[]
6267     if (computedShaderMaxLoc != simulationMaxLoc)
6268     {
6269         // Add one (to make sure no additional writes are done) and multiply by
6270         // the number of invocations and current primitive count
6271         maxLoc = (std::max(computedShaderMaxLoc, simulationMaxLoc) + 1) * (hostSubgroupStride * 128u * primitiveStride);
6272         sizes[OutputBallots] = maxLoc * sizeof(tcu::UVec4);
6273 
6274         if (sizes[OutputBallots] > limits.maxStorageBufferRange)
6275             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6276 
6277         try
6278         {
6279             buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
6280                 vk, device, allocator,
6281                 makeBufferCreateInfo(sizes[OutputBallots], usages[OutputBallots] | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
6282                                                                VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
6283                 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6284         }
6285         catch (tcu::ResourceError &)
6286         {
6287             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6288             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6289                                    "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
6290         }
6291         bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
6292         ptrs[OutputBallots]              = buffers[OutputBallots]->getAllocation().getHostPtr();
6293 
6294         vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
6295         setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
6296                                       VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
6297         setUpdateBuilder2.update(vk, device);
6298     }
6299 
6300     // Clear any writes to ballots/stores OutputB::b[] aka buffer[OutputBallots] during the counting pass
6301     // Note that its size would may change since the first memory allocation
6302     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6303     // Clear any writes to counting OutputC::loc[] aka buffer[OutputCounts] during the counting pass
6304     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6305     // Clear any writes to counting OutputP::p[] aka buffer[OutputPriMap] during the counting pass
6306     deMemset(ptrs[OutputPriMap], 0, (size_t)sizes[OutputPriMap]);
6307 
6308     // flush them all to the GPU
6309     flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
6310     flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6311     flushAlloc(vk, device, buffers[OutputPriMap]->getAllocation());
6312 
6313     // run the actual shader with updated PushConstant
6314     pc.enableInvocationIndex = VK_TRUE;
6315     callRecordDrawingAndSubmit();
6316 
6317     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6318     invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
6319     invalidateAlloc(vk, device, buffers[OutputPriMap]->getAllocation());
6320 
6321     // Simulate execution on the CPU, and compare against the GPU result
6322     try
6323     {
6324         ref.resize(maxLoc, tcu::UVec4());
6325     }
6326     catch (const std::bad_alloc &)
6327     {
6328         // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6329         return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
6330                                "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
6331     }
6332 
6333     std::fill(primitiveMap.begin(), primitiveMap.end(), 0);
6334     auto primitiveMapRange = makeStdBeginEnd<const uint32_t>(ptrs[OutputPriMap], (fragmentStride * primitiveStride));
6335     std::copy(primitiveMapRange.first, primitiveMapRange.second, primitiveMap.begin());
6336 
6337     const FragmentRandomProgram::Arrangement a(primitiveMap, m_data.sizeX, m_data.sizeY, m_subgroupSize,
6338                                                primitiveStride);
6339     const tcu::UVec4 *ballots = static_cast<tcu::UVec4 *>(ptrs[OutputBallots]);
6340 
6341     program->execute(m_context.getTestContext().getWatchDog(), false, m_subgroupSize, fragmentStride, primitiveStride,
6342                      ref, log, primitiveMap, ballots);
6343 
6344     const uint32_t finalMaxLoc = std::max(computedShaderMaxLoc, simulationMaxLoc);
6345     const qpTestResult res     = calculateAndLogResultEx(log, ballots, ref, finalMaxLoc, a, PrintMode::None);
6346 
6347     return tcu::TestStatus(res, qpGetTestResultName(res));
6348 }
6349 
createVertexBufferAndFlush(uint32_t cellsHorz,uint32_t cellsVert,VkPrimitiveTopology topology)6350 de::MovePtr<BufferWithMemory> ReconvergenceTestVertexInstance::createVertexBufferAndFlush(uint32_t cellsHorz,
6351                                                                                           uint32_t cellsVert,
6352                                                                                           VkPrimitiveTopology topology)
6353 {
6354     DE_UNREF(topology);
6355     DE_ASSERT(VK_PRIMITIVE_TOPOLOGY_POINT_LIST == topology);
6356     const std::vector<tcu::Vec4> vertices =
6357         VertexRandomProgram::Arrangement::generatePrimitives(cellsHorz, cellsVert, VertexRandomProgram::fillPercentage);
6358     return ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(vertices);
6359 }
6360 
createShaders(void)6361 std::vector<Move<VkShaderModule>> ReconvergenceTestVertexInstance::createShaders(void)
6362 {
6363     const DeviceInterface &vk = m_context.getDeviceInterface();
6364     const VkDevice device     = m_context.getDevice();
6365 
6366     Move<VkShaderModule> vertex   = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"), 0);
6367     Move<VkShaderModule> fragment = createShaderModule(vk, device, m_context.getBinaryCollection().get("frag"), 0);
6368 
6369     // { #vert, #frag, #tesc, tese, geom }; if any
6370     std::vector<Move<VkShaderModule>> shaders;
6371     shaders.emplace_back(vertex);
6372     shaders.emplace_back(fragment);
6373 
6374     return shaders;
6375 }
6376 
iterate(void)6377 tcu::TestStatus ReconvergenceTestVertexInstance::iterate(void)
6378 {
6379     const VkPhysicalDeviceLimits &limits = m_context.getDeviceProperties().limits;
6380     if (sizeof(PushConstant) > limits.maxPushConstantsSize)
6381     {
6382         return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6383                                "PushConstant size " + std::to_string(sizeof(PushConstant)) + " exceeds device limit " +
6384                                    std::to_string(limits.maxPushConstantsSize));
6385     }
6386 
6387     const DeviceInterface &vk          = m_context.getDeviceInterface();
6388     const VkDevice device              = m_context.getDevice();
6389     Allocator &allocator               = m_context.getDefaultAllocator();
6390     const uint32_t queueIndex          = m_context.getUniversalQueueFamilyIndex();
6391     add_ref<tcu::TestLog> log          = m_context.getTestContext().getLog();
6392     const VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
6393     const uint32_t fragmentStride      = uint32_t(m_data.sizeX * m_data.sizeY);
6394     const uint32_t invocationStride =
6395         static_cast<uint32_t>(VertexRandomProgram::Arrangement::generatePrimitives(m_data.sizeX, m_data.sizeY,
6396                                                                                    VertexRandomProgram::fillPercentage)
6397                                   .size());
6398 
6399     de::MovePtr<VertexRandomProgram> program(new VertexRandomProgram(m_data));
6400     program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
6401 
6402     // simulate content of outputP buffer
6403     std::vector<uint32_t> outputP =
6404         VertexRandomProgram::Arrangement::generateOutputPvector(m_subgroupSize, invocationStride);
6405 
6406     std::vector<tcu::UVec4> ref;
6407     const uint32_t hostMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), true, m_subgroupSize,
6408                                                  fragmentStride, invocationStride, ref, log, outputP, nullptr);
6409     log << tcu::TestLog::Message << "Rendering area  : " << tcu::UVec2(m_data.sizeX, m_data.sizeY)
6410         << tcu::TestLog::EndMessage;
6411     log << tcu::TestLog::Message << "invocationStride: " << invocationStride << tcu::TestLog::EndMessage;
6412     log << tcu::TestLog::Message << "Simulated maxLoc: " << hostMaxLoc << tcu::TestLog::EndMessage;
6413     // maxLoc is per-invocation. Add one (to make sure no additional writes are done).
6414     uint32_t maxLoc = hostMaxLoc;
6415     maxLoc += 1;
6416     maxLoc *= invocationStride;
6417 
6418     constexpr uint32_t bufferCount = 4u;
6419     enum Bindings
6420     {
6421         InputA,
6422         OutputBallots,
6423         OutputCounts,
6424         OutputPrimitives
6425     };
6426 
6427     de::MovePtr<BufferWithMemory> buffers[bufferCount];
6428     vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
6429 
6430     uint32_t counts[bufferCount]{// InputA  { uint    a[]; } inputA;
6431                                  uint32_t(m_data.sizeX * m_data.sizeY),
6432                                  // OutputB { uvec2   b[]; } outputB;
6433                                  maxLoc,
6434                                  // OutputC { uint loc[]; } outputC;
6435                                  invocationStride,
6436                                  // OutputP { uint p[]; } outputP;
6437                                  uint32_t(outputP.size())};
6438 
6439     VkDeviceSize sizes[bufferCount]{// InputA  { uint    a[]; } inputA;
6440                                     counts[InputA] * sizeof(uint32_t),
6441                                     // OutputB { uvec2   b[]; } outputB;
6442                                     counts[OutputBallots] * sizeof(tcu::UVec4),
6443                                     // OutputC { uint loc[]; } outputC;
6444                                     counts[OutputCounts] * sizeof(uint32_t),
6445                                     // OutputP { uint p[]; } outputP;
6446                                     counts[OutputPrimitives] * sizeof(uint32_t)};
6447 
6448     const VkBufferUsageFlags cmnUsages = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
6449     VkBufferUsageFlags usages[bufferCount]{
6450         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6451         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6452         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6453         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6454     };
6455 
6456     // allocate buffers
6457     for (uint32_t i = 0; i < bufferCount; ++i)
6458     {
6459         if (sizes[i] > limits.maxStorageBufferRange)
6460             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6461 
6462         try
6463         {
6464             buffers[i] = de::MovePtr<BufferWithMemory>(
6465                 new BufferWithMemory(vk, device, allocator, makeBufferCreateInfo(sizes[i], usages[i] | cmnUsages),
6466                                      MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6467         }
6468         catch (tcu::ResourceError &)
6469         {
6470             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6471             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6472                                    "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
6473         }
6474         bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
6475     }
6476 
6477     // get raw pointers to previously allocated buffers
6478     void *ptrs[bufferCount];
6479     for (uint32_t i = 0; i < bufferCount; ++i)
6480     {
6481         ptrs[i] = buffers[i]->getAllocation().getHostPtr();
6482     }
6483 
6484     // populate buffers with their destination
6485     {
6486         auto rangeBufferA = makeStdBeginEnd<uint32_t>(ptrs[InputA], counts[InputA]);
6487         std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
6488     }
6489     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6490     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6491     deMemset(ptrs[OutputPrimitives], 0, (size_t)sizes[OutputPrimitives]);
6492 
6493     // (...) and flush them to the GPU
6494     for (uint32_t i = 0; i < bufferCount; ++i)
6495     {
6496         flushAlloc(vk, device, buffers[i]->getAllocation());
6497     }
6498 
6499     VkDescriptorType descTypes[bufferCount]{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6500                                             VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER};
6501 
6502     vk::DescriptorSetLayoutBuilder layoutBuilder;
6503     for (uint32_t i = 0; i < bufferCount; ++i)
6504     {
6505         layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
6506     }
6507     vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
6508 
6509     vk::DescriptorPoolBuilder poolBuilder;
6510     for (uint32_t i = 0; i < bufferCount; ++i)
6511     {
6512         poolBuilder.addType(descTypes[i], 1);
6513     }
6514     vk::Unique<vk::VkDescriptorPool> descriptorPool(
6515         poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
6516     vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
6517 
6518     vk::DescriptorSetUpdateBuilder setUpdateBuilder;
6519     for (uint32_t i = 0; i < bufferCount; ++i)
6520     {
6521         setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
6522                                      &bufferDescriptors[i]);
6523     }
6524     setUpdateBuilder.update(vk, device);
6525 
6526     const VkPushConstantRange pushConstantRange{
6527         (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
6528         0u,                                     // uint32_t offset;
6529         sizeof(PushConstant)                    // uint32_t size;
6530     };
6531 
6532     const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
6533         VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
6534         DE_NULL,                                       // pNext
6535         (VkPipelineLayoutCreateFlags)0,                // flags
6536         1u,                                            // setLayoutCount
6537         &descriptorSetLayout.get(),                    // pSetLayouts
6538         1u,                                            // pushConstantRangeCount
6539         &pushConstantRange,                            // pPushConstantRanges
6540     };
6541 
6542     const uint32_t imageWidth  = m_data.sizeX;
6543     const uint32_t imageHeight = m_data.sizeY;
6544     const VkFormat format      = VK_FORMAT_R8G8B8A8_UNORM;
6545     const VkImageCreateInfo imageCreateInfo{
6546         VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
6547         nullptr,                             // const void* pNext;
6548         VkImageCreateFlags(0),               // VkImageCreateFlags flags;
6549         VK_IMAGE_TYPE_2D,                    // VkImageType imageType;
6550         format,                              // VkFormat format;
6551         {imageWidth, imageHeight, 1u},       // VkExtent3D extent;
6552         1u,                                  // uint32_t mipLevels;
6553         1u,                                  // uint32_t arrayLayers;
6554         VK_SAMPLE_COUNT_1_BIT,               // VkSampleCountFlagBits samples;
6555         VK_IMAGE_TILING_OPTIMAL,             // VkImageTiling tiling;
6556         VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
6557         VK_SHARING_MODE_EXCLUSIVE,           // VkSharingMode sharingMode;
6558         0u,                                  // uint32_t queueFamilyIndexCount;
6559         0u,                                  // const uint32_t* pQueueFamilyIndices;
6560         VK_IMAGE_LAYOUT_UNDEFINED            // VkImageLayout initialLayout;
6561     };
6562     const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
6563     de::MovePtr<ImageWithMemory> image(
6564         new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
6565     Move<VkImageView> view        = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
6566     Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
6567     Move<VkFramebuffer> framebuffer =
6568         makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
6569     de::MovePtr<BufferWithMemory> vertexBuffer  = createVertexBufferAndFlush(m_data.sizeX, m_data.sizeY, topology);
6570     const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
6571     const Shaders shaders                       = createShaders();
6572     Move<VkPipelineLayout> pipelineLayout       = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
6573     Move<VkPipeline> pipeline =
6574         createGraphicsPipeline(*pipelineLayout, *renderPass, imageWidth, imageHeight, shaders, topology, 0u);
6575     Move<VkCommandPool> cmdPool =
6576         createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
6577     Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6578 
6579     PushConstant pc{};
6580     pc.invocationStride      = invocationStride;
6581     pc.width                 = m_data.sizeX;
6582     pc.height                = m_data.sizeY;
6583     pc.enableInvocationIndex = VK_FALSE;
6584 
6585     auto callRecordDrawingAndSubmit = std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this,
6586                                                 *cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, std::cref(pc),
6587                                                 std::cref(renderBeginInfo), **vertexBuffer, invocationStride, **image);
6588 
6589     // compute "maxLoc", which is a potential maximum number of locations written
6590     callRecordDrawingAndSubmit();
6591 
6592     // Take the maximum of "maxLoc" over all invocations.
6593     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6594     auto rangeLoc               = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], counts[OutputCounts]);
6595     const uint32_t shaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
6596     log << tcu::TestLog::Message << "Computed maxLoc in shader: " << shaderMaxLoc << tcu::TestLog::EndMessage;
6597 
6598     // If we need more space, reallocate OutputB::b[] aka buffers[1]
6599     if (shaderMaxLoc != hostMaxLoc)
6600     {
6601         // Add one (to make sure no additional writes are done) and multiply by
6602         // the number of invocations and current primitive count
6603         maxLoc                = (std::max(shaderMaxLoc, hostMaxLoc) + 1u) * invocationStride;
6604         counts[OutputBallots] = maxLoc;
6605         sizes[OutputBallots]  = counts[OutputBallots] * sizeof(tcu::UVec4);
6606 
6607         if (sizes[OutputBallots] > limits.maxStorageBufferRange)
6608             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6609 
6610         try
6611         {
6612             buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
6613                 vk, device, allocator, makeBufferCreateInfo(sizes[OutputBallots], usages[OutputBallots] | cmnUsages),
6614                 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6615         }
6616         catch (tcu::ResourceError &)
6617         {
6618             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6619             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6620                                    "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
6621         }
6622         bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
6623         ptrs[OutputBallots]              = buffers[OutputBallots]->getAllocation().getHostPtr();
6624 
6625         vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
6626         setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
6627                                       VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
6628         setUpdateBuilder2.update(vk, device);
6629     }
6630 
6631     // Clear any writes to ballots/stores OutputB::b[] aka buffer[1] during the counting pass
6632     // Note that its size would may change since the first memory allocation
6633     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6634     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6635     deMemset(ptrs[OutputPrimitives], 0, (size_t)sizes[OutputPrimitives]);
6636 
6637     // flush them all to the GPU
6638     flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
6639     flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6640     flushAlloc(vk, device, buffers[OutputPrimitives]->getAllocation());
6641 
6642     // run the actual shader with updated PushConstant
6643     pc.enableInvocationIndex = VK_TRUE;
6644     callRecordDrawingAndSubmit();
6645 
6646     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6647     const uint32_t finalShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
6648     log << tcu::TestLog::Message << "Final maxLoc from shader: " << finalShaderMaxLoc << tcu::TestLog::EndMessage;
6649     if (finalShaderMaxLoc != shaderMaxLoc)
6650     {
6651         return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6652                                "maxLoc differs across shader invocations, expected: " + de::toString(shaderMaxLoc) +
6653                                    " got: " + de::toString(finalShaderMaxLoc));
6654     }
6655 
6656     invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
6657     const tcu::UVec4 *ballots = static_cast<tcu::UVec4 *>(ptrs[OutputBallots]);
6658 
6659     invalidateAlloc(vk, device, buffers[OutputPrimitives]->getAllocation());
6660     auto outputPrange = makeStdBeginEnd<uint32_t>(ptrs[OutputPrimitives], counts[OutputPrimitives]);
6661     std::copy(outputPrange.first, outputPrange.second, outputP.begin());
6662 
6663     try
6664     {
6665         ref.resize(counts[OutputBallots], tcu::UVec4(0u, 0u, 0u, 0u));
6666     }
6667     catch (const std::bad_alloc &)
6668     {
6669         // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6670         return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
6671                                "Failed system memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
6672     }
6673 
6674     // Simulate execution on the CPU, and compare against the GPU result
6675     const uint32_t finalHostMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), false, m_subgroupSize,
6676                                                       fragmentStride, invocationStride, ref, log, outputP, ballots);
6677 
6678     const qpTestResult res = calculateAndLogResultEx(log, ballots, ref, finalHostMaxLoc, PrintMode::None);
6679 
6680     return tcu::TestStatus(res, qpGetTestResultName(res));
6681 }
6682 
calculateAndLogResultEx(add_ref<tcu::TestLog> log,const tcu::UVec4 * result,const std::vector<tcu::UVec4> & ref,const uint32_t maxLoc,const PrintMode printMode)6683 qpTestResult_e ReconvergenceTestVertexInstance::calculateAndLogResultEx(add_ref<tcu::TestLog> log,
6684                                                                         const tcu::UVec4 *result,
6685                                                                         const std::vector<tcu::UVec4> &ref,
6686                                                                         const uint32_t maxLoc,
6687                                                                         const PrintMode printMode)
6688 {
6689     DE_UNREF(maxLoc);
6690     DE_UNREF(printMode);
6691 
6692     qpTestResult res                  = QP_TEST_RESULT_PASS;
6693     uint32_t mismatchCount            = 0u;
6694     const uint32_t printMismatchCount = 5u;
6695 
6696     // With maximal reconvergence, we should expect the output to exactly match the reference.
6697     const uint32_t ballotStoreCount = static_cast<uint32_t>(ref.size());
6698     for (uint32_t i = 0; i < ballotStoreCount; ++i)
6699     {
6700         const Ballot resultVal(result[i], m_subgroupSize);
6701         const Ballot refVal(ref.at(i), m_subgroupSize);
6702         if (resultVal != refVal)
6703         {
6704             if (mismatchCount++ < printMismatchCount)
6705             {
6706                 res = QP_TEST_RESULT_FAIL;
6707                 log << tcu::TestLog::Message << "Mismatch at " << i << "\nexpected: " << resultVal
6708                     << "\n     got: " << refVal << tcu::TestLog::EndMessage;
6709                 if (printMode == PrintMode::Console)
6710                 {
6711                     std::cout << "Mismatch at " << i << "\nexpected: " << resultVal << "\n     got: " << refVal
6712                               << std::endl;
6713                 }
6714             }
6715         }
6716     }
6717 
6718     log << tcu::TestLog::Message << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount
6719         << tcu::TestLog::EndMessage;
6720     if (printMode == PrintMode::Console)
6721     {
6722         std::cout << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount << std::endl;
6723     }
6724 
6725     return res;
6726 }
6727 
createShaders(void)6728 std::vector<Move<VkShaderModule>> ReconvergenceTestTessCtrlInstance::createShaders(void)
6729 {
6730     const DeviceInterface &vk = m_context.getDeviceInterface();
6731     const VkDevice device     = m_context.getDevice();
6732 
6733     Move<VkShaderModule> vertex     = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"));
6734     Move<VkShaderModule> fragment   = createShaderModule(vk, device, m_context.getBinaryCollection().get("frag"));
6735     Move<VkShaderModule> control    = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"));
6736     Move<VkShaderModule> evaluation = createShaderModule(vk, device, m_context.getBinaryCollection().get("tese"));
6737 
6738     // { #vert, #frag, #tesc, #tese, geom }; if any
6739     std::vector<Move<VkShaderModule>> shaders;
6740     shaders.emplace_back(vertex);
6741     shaders.emplace_back(fragment);
6742     shaders.emplace_back(control);
6743     shaders.emplace_back(evaluation);
6744 
6745     return shaders;
6746 }
6747 
iterate(void)6748 tcu::TestStatus ReconvergenceTestTessCtrlInstance::iterate(void)
6749 {
6750     const DeviceInterface &vk = m_context.getDeviceInterface();
6751     const VkDevice device     = m_context.getDevice();
6752     Allocator &allocator      = m_context.getDefaultAllocator();
6753     const uint32_t queueIndex = m_context.getUniversalQueueFamilyIndex();
6754     add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
6755 
6756     if (m_subgroupSize < TessCtrlRandomProgram::minSubgroupSize || m_subgroupSize > 64)
6757     {
6758         std::stringstream str;
6759         str << "Subgroup size less than " << TessCtrlRandomProgram::minSubgroupSize
6760             << " or greater than 64 not handled.";
6761         str.flush();
6762         TCU_THROW(TestError, str.str());
6763     }
6764 
6765     deRandom rnd;
6766     deRandom_init(&rnd, m_data.seed);
6767 
6768     vk::VkPhysicalDeviceProperties2 properties2;
6769     deMemset(&properties2, 0, sizeof(properties2));
6770     properties2.sType = vk::VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
6771     m_context.getInstanceInterface().getPhysicalDeviceProperties2(m_context.getPhysicalDevice(), &properties2);
6772     const VkPhysicalDeviceLimits &limits = properties2.properties.limits;
6773 
6774     const uint32_t patchControlPoints = 1;
6775     const uint32_t vertexCount =
6776         (m_subgroupSize / TessCtrlRandomProgram::minSubgroupSize) * patchControlPoints * m_data.sizeX;
6777     const uint32_t primitiveStride = vertexCount / patchControlPoints;
6778     de::MovePtr<BufferWithMemory> vertexBuffer =
6779         createVertexBufferAndFlush(vertexCount, 1u, VK_PRIMITIVE_TOPOLOGY_PATCH_LIST);
6780     const uint32_t invocationStride = vertexCount * TessCtrlRandomProgram::minSubgroupSize;
6781     DE_ASSERT(invocationStride < MAX_INVOCATIONS_ALL_TESTS);
6782 
6783     log << tcu::TestLog::Message << "LayoutVertexOut:    " << (uint32_t)TessCtrlRandomProgram::minSubgroupSize
6784         << tcu::TestLog::EndMessage;
6785     log << tcu::TestLog::Message << "patchControlPoints: " << patchControlPoints << tcu::TestLog::EndMessage;
6786     log << tcu::TestLog::Message << "primitiveStride:    " << primitiveStride << tcu::TestLog::EndMessage;
6787     log << tcu::TestLog::Message << "invocationStride:   " << invocationStride << tcu::TestLog::EndMessage;
6788     log << tcu::TestLog::Message << "usedSubgroupCount:  " << m_data.sizeX << tcu::TestLog::EndMessage;
6789 
6790     de::MovePtr<TessCtrlRandomProgram> program(new TessCtrlRandomProgram(m_data, invocationStride));
6791     program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
6792 
6793     std::vector<uint64_t> ref;
6794     const uint32_t simulationMaxLoc = program->simulate(true, m_subgroupSize, ref);
6795     log << tcu::TestLog::Message << "simulated maxLoc: " << simulationMaxLoc << tcu::TestLog::EndMessage;
6796     // maxLoc is per-invocation. Add one (to make sure no additional writes are done)
6797     uint32_t maxLoc = simulationMaxLoc;
6798     maxLoc += 1;
6799     maxLoc *= invocationStride;
6800 
6801     constexpr uint32_t bufferCount = 3;
6802     enum Bindings
6803     {
6804         InputA,
6805         OutputBallots,
6806         OutputCounts,
6807     };
6808 
6809     de::MovePtr<BufferWithMemory> buffers[bufferCount];
6810     vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
6811 
6812     VkDeviceSize sizes[bufferCount]{
6813         // InputA  { uint    a[]; } inputA;  filled with a[i] == i
6814         invocationStride * sizeof(uint32_t),
6815         // OutputB { uvec2   b[]; } outputB;
6816         maxLoc * sizeof(uint64_t),
6817         // OutputC { uint loc[]; } outputC;
6818         invocationStride * sizeof(uint32_t),
6819     };
6820 
6821     VkBufferUsageFlags usages[bufferCount]{
6822         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6823         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6824         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6825     };
6826 
6827     // allocate buffers
6828     for (uint32_t i = 0; i < bufferCount; ++i)
6829     {
6830         if (sizes[i] > limits.maxStorageBufferRange)
6831             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6832 
6833         try
6834         {
6835             buffers[i] = de::MovePtr<BufferWithMemory>(
6836                 new BufferWithMemory(vk, device, allocator,
6837                                      makeBufferCreateInfo(sizes[i], usages[i] | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
6838                                                                         VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
6839                                      MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6840         }
6841         catch (tcu::ResourceError &)
6842         {
6843             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6844             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6845                                    "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
6846         }
6847         bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
6848     }
6849 
6850     // get raw pointers to previously allocated buffers
6851     void *ptrs[bufferCount];
6852     for (uint32_t i = 0; i < bufferCount; ++i)
6853     {
6854         ptrs[i] = (uint32_t *)buffers[i]->getAllocation().getHostPtr();
6855     }
6856 
6857     // populate buffers with their destination
6858     {
6859         auto rangeBufferA = makeStdBeginEnd<uint32_t>(ptrs[InputA], invocationStride);
6860         std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
6861     }
6862     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6863     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6864 
6865     // (...) and flush them to the GPU
6866     for (uint32_t i = 0; i < bufferCount; ++i)
6867     {
6868         flushAlloc(vk, device, buffers[i]->getAllocation());
6869     }
6870 
6871     VkDescriptorType descTypes[bufferCount]{
6872         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6873         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6874         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6875     };
6876 
6877     vk::DescriptorSetLayoutBuilder layoutBuilder;
6878     for (uint32_t i = 0; i < bufferCount; ++i)
6879     {
6880         layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
6881     }
6882     vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
6883 
6884     vk::DescriptorPoolBuilder poolBuilder;
6885     for (uint32_t i = 0; i < bufferCount; ++i)
6886     {
6887         poolBuilder.addType(descTypes[i], 1);
6888     }
6889     vk::Unique<vk::VkDescriptorPool> descriptorPool(
6890         poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
6891     vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
6892 
6893     vk::DescriptorSetUpdateBuilder setUpdateBuilder;
6894     for (uint32_t i = 0; i < bufferCount; ++i)
6895     {
6896         setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
6897                                      &bufferDescriptors[i]);
6898     }
6899     setUpdateBuilder.update(vk, device);
6900 
6901     const VkPushConstantRange pushConstantRange{
6902         (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
6903         0u,                                     // uint32_t offset;
6904         sizeof(PushConstant)                    // uint32_t size;
6905     };
6906 
6907     // TODO: verify that PushConstant is available on running machine
6908 
6909     const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
6910         VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
6911         DE_NULL,                                       // pNext
6912         (VkPipelineLayoutCreateFlags)0,                // flags
6913         1u,                                            // setLayoutCount
6914         &descriptorSetLayout.get(),                    // pSetLayouts
6915         1u,                                            // pushConstantRangeCount
6916         &pushConstantRange,                            // pPushConstantRanges
6917     };
6918 
6919     const uint32_t imageWidth  = 256;
6920     const uint32_t imageHeight = 256;
6921     const VkFormat format      = VK_FORMAT_R8G8B8A8_UNORM;
6922     const VkImageCreateInfo imageCreateInfo{
6923         VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
6924         nullptr,                             // const void* pNext;
6925         VkImageCreateFlags(0),               // VkImageCreateFlags flags;
6926         VK_IMAGE_TYPE_2D,                    // VkImageType imageType;
6927         format,                              // VkFormat format;
6928         {imageWidth, imageHeight, 1u},       // VkExtent3D extent;
6929         1u,                                  // uint32_t mipLevels;
6930         1u,                                  // uint32_t arrayLayers;
6931         VK_SAMPLE_COUNT_1_BIT,               // VkSampleCountFlagBits samples;
6932         VK_IMAGE_TILING_OPTIMAL,             // VkImageTiling tiling;
6933         VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
6934         VK_SHARING_MODE_EXCLUSIVE,           // VkSharingMode sharingMode;
6935         0u,                                  // uint32_t queueFamilyIndexCount;
6936         0u,                                  // const uint32_t* pQueueFamilyIndices;
6937         VK_IMAGE_LAYOUT_UNDEFINED            // VkImageLayout initialLayout;
6938     };
6939     const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
6940     de::MovePtr<ImageWithMemory> image(
6941         new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
6942     Move<VkImageView> view        = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
6943     Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
6944     Move<VkFramebuffer> framebuffer =
6945         makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
6946     const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
6947     const Shaders shaders                       = createShaders();
6948     Move<VkPipelineLayout> pipelineLayout       = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
6949     Move<VkPipeline> pipeline = createGraphicsPipeline(*pipelineLayout, *renderPass, imageWidth, imageHeight, shaders,
6950                                                        VK_PRIMITIVE_TOPOLOGY_PATCH_LIST, patchControlPoints);
6951     Move<VkCommandPool> cmdPool =
6952         createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
6953     Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6954 
6955     PushConstant pc{};
6956     pc.invocationStride = 0u;
6957     pc.width            = TessCtrlRandomProgram::minSubgroupSize;
6958     pc.height           = patchControlPoints;
6959     pc.primitiveStride  = primitiveStride;
6960 
6961     auto callRecordDrawingAndSubmit = std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this,
6962                                                 *cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, std::cref(pc),
6963                                                 std::cref(renderBeginInfo), **vertexBuffer, vertexCount, **image);
6964 
6965     // compute "maxLoc", which is a potential maximum number of locations written
6966     callRecordDrawingAndSubmit();
6967 
6968     // Take the maximum of "maxLoc" over all invocations.
6969     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6970     auto rangeLoc                       = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], invocationStride);
6971     const uint32_t computedShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
6972     log << tcu::TestLog::Message << "computed shaderMaxLoc: " << computedShaderMaxLoc << tcu::TestLog::EndMessage;
6973 
6974     // If we need more space, reallocate OutputB::b[] aka buffers[1]
6975     if (computedShaderMaxLoc > simulationMaxLoc)
6976     {
6977         // Add one (to make sure no additional writes are done) and multiply by
6978         // the number of invocations and current primitive count
6979         maxLoc               = (computedShaderMaxLoc + 1) * invocationStride;
6980         sizes[OutputBallots] = maxLoc * sizeof(uint64_t);
6981 
6982         if (sizes[OutputBallots] > limits.maxStorageBufferRange)
6983             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6984 
6985         try
6986         {
6987             buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
6988                 vk, device, allocator,
6989                 makeBufferCreateInfo(sizes[1], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
6990                                                    VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
6991                 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6992         }
6993         catch (tcu::ResourceError &)
6994         {
6995             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6996             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6997                                    "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
6998         }
6999         bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
7000         ptrs[OutputBallots]              = buffers[OutputBallots]->getAllocation().getHostPtr();
7001 
7002         vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
7003         setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
7004                                       VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
7005         setUpdateBuilder2.update(vk, device);
7006     }
7007 
7008     // Clear any writes to ballots/stores OutputB::b[] aka buffer[1] during the counting pass
7009     // Note that its size would may change since the first memory allocation
7010     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7011     // Clear any writes to counting OutputC::loc[] aka buffer[2] during the counting pass
7012     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7013 
7014     // flush them all to the GPU
7015     flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7016     flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7017 
7018     // run the actual shader with updated PushConstant
7019     pc.invocationStride = invocationStride;
7020     pc.width            = TessCtrlRandomProgram::minSubgroupSize;
7021     pc.height           = patchControlPoints;
7022     pc.primitiveStride  = primitiveStride;
7023     callRecordDrawingAndSubmit();
7024 
7025     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7026     const uint32_t finalShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7027     log << tcu::TestLog::Message << "final shaderMaxLoc: " << finalShaderMaxLoc << tcu::TestLog::EndMessage;
7028     if (finalShaderMaxLoc > computedShaderMaxLoc)
7029     {
7030         return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING, "maxLoc differs across shader invocations");
7031     }
7032 
7033     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7034     invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7035 
7036     // Simulate execution on the CPU, and compare against the GPU result
7037     try
7038     {
7039         ref.resize(maxLoc, 0ull);
7040     }
7041     catch (const std::bad_alloc &)
7042     {
7043         // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7044         return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
7045                                "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
7046     }
7047 
7048     program->simulate(false, m_subgroupSize, ref);
7049 
7050     const uint64_t *ballots = static_cast<uint64_t *>(ptrs[OutputBallots]);
7051     qpTestResult res        = calculateAndLogResult(ballots, ref, invocationStride, m_subgroupSize, finalShaderMaxLoc,
7052                                                     (invocationStride / 3), PrintMode::None);
7053 
7054     return tcu::TestStatus(res, qpGetTestResultName(res));
7055 }
7056 
createShaders(void)7057 std::vector<Move<VkShaderModule>> ReconvergenceTestTessEvalInstance::createShaders(void)
7058 {
7059     const DeviceInterface &vk = m_context.getDeviceInterface();
7060     const VkDevice device     = m_context.getDevice();
7061 
7062     Move<VkShaderModule> vertex     = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"));
7063     Move<VkShaderModule> fragment   = createShaderModule(vk, device, m_context.getBinaryCollection().get("frag"));
7064     Move<VkShaderModule> control    = createShaderModule(vk, device, m_context.getBinaryCollection().get("tesc"));
7065     Move<VkShaderModule> evaluation = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"));
7066 
7067     // { #vert, #frag, #tesc, #tese, geom }; if any
7068     std::vector<Move<VkShaderModule>> shaders;
7069     shaders.emplace_back(vertex);
7070     shaders.emplace_back(fragment);
7071     shaders.emplace_back(control);
7072     shaders.emplace_back(evaluation);
7073 
7074     return shaders;
7075 }
7076 
iterate(void)7077 tcu::TestStatus ReconvergenceTestTessEvalInstance::iterate(void)
7078 {
7079     const DeviceInterface &vk = m_context.getDeviceInterface();
7080     const VkDevice device     = m_context.getDevice();
7081     Allocator &allocator      = m_context.getDefaultAllocator();
7082     const uint32_t queueIndex = m_context.getUniversalQueueFamilyIndex();
7083     add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
7084 
7085     if (m_subgroupSize < TessEvalRandomProgram::quadInvocationCount || m_subgroupSize > 64)
7086     {
7087         std::stringstream str;
7088         str << "Subgroup size less than " << TessEvalRandomProgram::quadInvocationCount
7089             << " or greater than 64 not handled.";
7090         str.flush();
7091         TCU_THROW(TestError, str.str());
7092     }
7093 
7094     deRandom rnd;
7095     deRandom_init(&rnd, m_data.seed);
7096 
7097     vk::VkPhysicalDeviceProperties2 properties2;
7098     deMemset(&properties2, 0, sizeof(properties2));
7099     properties2.sType = vk::VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
7100     m_context.getInstanceInterface().getPhysicalDeviceProperties2(m_context.getPhysicalDevice(), &properties2);
7101     const VkPhysicalDeviceLimits &limits = properties2.properties.limits;
7102 
7103     const uint32_t patchesPerGroup             = m_subgroupSize / TessEvalRandomProgram::quadInvocationCount;
7104     const uint32_t primitiveStride             = patchesPerGroup * m_data.sizeX;
7105     const uint32_t invocationStride            = primitiveStride * TessEvalRandomProgram::quadInvocationCount;
7106     const std::vector<tcu::Vec4> vertices      = generateVertices(invocationStride, VK_PRIMITIVE_TOPOLOGY_POINT_LIST);
7107     const uint32_t vertexCount                 = uint32_t(vertices.size());
7108     de::MovePtr<BufferWithMemory> vertexBuffer = createVertexBufferAndFlush(vertices);
7109 
7110     DE_ASSERT(invocationStride <= MAX_INVOCATIONS_ALL_TESTS);
7111 
7112     de::MovePtr<TessEvalRandomProgram> program(new TessEvalRandomProgram(m_data, invocationStride));
7113     program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
7114 
7115     std::vector<uint64_t> ref;
7116     const uint32_t simulationMaxLoc = program->simulate(true, m_subgroupSize, ref);
7117     log << tcu::TestLog::Message << "simulated maxLoc:       " << simulationMaxLoc << tcu::TestLog::EndMessage;
7118     log << tcu::TestLog::Message << "effective patch size:   " << m_data.sizeY << tcu::TestLog::EndMessage;
7119     log << tcu::TestLog::Message << "effective patch count:  " << primitiveStride << tcu::TestLog::EndMessage;
7120     log << tcu::TestLog::Message << "total invocation count: " << invocationStride << tcu::TestLog::EndMessage;
7121 
7122     // maxLoc is per-invocation. Add one (to make sure no additional writes are done).
7123     uint32_t maxLoc = simulationMaxLoc;
7124     maxLoc += 1;
7125     maxLoc *= invocationStride;
7126 
7127     constexpr uint32_t bufferCount = 3;
7128     enum Bindings
7129     {
7130         InputA,
7131         OutputBallots,
7132         OutputCounts,
7133     };
7134 
7135     de::MovePtr<BufferWithMemory> buffers[bufferCount];
7136     vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
7137 
7138     VkDeviceSize sizes[bufferCount]{
7139         // InputA  { uint    a[]; } inputA;  filled with a[i] == i
7140         invocationStride * sizeof(uint32_t),
7141         // OutputB { uvec2   b[]; } outputB;
7142         maxLoc * sizeof(uint64_t),
7143         // OutputC { uint loc[]; } outputC;
7144         invocationStride * sizeof(uint32_t),
7145     };
7146 
7147     VkBufferUsageFlags usages[bufferCount]{
7148         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7149         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7150         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7151     };
7152 
7153     // allocate buffers
7154     for (uint32_t i = 0; i < bufferCount; ++i)
7155     {
7156         if (sizes[i] > limits.maxStorageBufferRange)
7157             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
7158 
7159         try
7160         {
7161             buffers[i] = de::MovePtr<BufferWithMemory>(
7162                 new BufferWithMemory(vk, device, allocator,
7163                                      makeBufferCreateInfo(sizes[i], usages[i] | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
7164                                                                         VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
7165                                      MemoryRequirement::HostVisible | MemoryRequirement::Cached));
7166         }
7167         catch (tcu::ResourceError &)
7168         {
7169             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7170             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7171                                    "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
7172         }
7173         bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
7174     }
7175 
7176     // get raw pointers to previously allocated buffers
7177     void *ptrs[bufferCount];
7178     for (uint32_t i = 0; i < bufferCount; ++i)
7179     {
7180         ptrs[i] = (uint32_t *)buffers[i]->getAllocation().getHostPtr();
7181     }
7182 
7183     // populate buffers with their destination
7184     {
7185         auto rangeBufferA = makeStdBeginEnd<uint32_t>(ptrs[InputA], invocationStride);
7186         std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
7187     }
7188     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7189     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7190 
7191     // (...) and flush them to the GPU
7192     for (uint32_t i = 0; i < bufferCount; ++i)
7193     {
7194         flushAlloc(vk, device, buffers[i]->getAllocation());
7195     }
7196 
7197     VkDescriptorType descTypes[bufferCount]{
7198         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7199         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7200         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7201     };
7202 
7203     vk::DescriptorSetLayoutBuilder layoutBuilder;
7204     for (uint32_t i = 0; i < bufferCount; ++i)
7205     {
7206         layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
7207     }
7208     vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
7209 
7210     vk::DescriptorPoolBuilder poolBuilder;
7211     for (uint32_t i = 0; i < bufferCount; ++i)
7212     {
7213         poolBuilder.addType(descTypes[i], 1);
7214     }
7215     vk::Unique<vk::VkDescriptorPool> descriptorPool(
7216         poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
7217     vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
7218 
7219     vk::DescriptorSetUpdateBuilder setUpdateBuilder;
7220     for (uint32_t i = 0; i < bufferCount; ++i)
7221     {
7222         setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
7223                                      &bufferDescriptors[i]);
7224     }
7225     setUpdateBuilder.update(vk, device);
7226 
7227     const VkPushConstantRange pushConstantRange{
7228         (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
7229         0u,                                     // uint32_t offset;
7230         sizeof(PushConstant)                    // uint32_t size;
7231     };
7232 
7233     // TODO: verify that PushConstant is available on running machine
7234 
7235     const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
7236         VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
7237         DE_NULL,                                       // pNext
7238         (VkPipelineLayoutCreateFlags)0,                // flags
7239         1u,                                            // setLayoutCount
7240         &descriptorSetLayout.get(),                    // pSetLayouts
7241         1u,                                            // pushConstantRangeCount
7242         &pushConstantRange,                            // pPushConstantRanges
7243     };
7244 
7245     const uint32_t imageWidth  = 256;
7246     const uint32_t imageHeight = 256;
7247     const VkFormat format      = VK_FORMAT_R8G8B8A8_UNORM;
7248     const VkImageCreateInfo imageCreateInfo{
7249         VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
7250         nullptr,                             // const void* pNext;
7251         VkImageCreateFlags(0),               // VkImageCreateFlags flags;
7252         VK_IMAGE_TYPE_2D,                    // VkImageType imageType;
7253         format,                              // VkFormat format;
7254         {imageWidth, imageHeight, 1u},       // VkExtent3D extent;
7255         1u,                                  // uint32_t mipLevels;
7256         1u,                                  // uint32_t arrayLayers;
7257         VK_SAMPLE_COUNT_1_BIT,               // VkSampleCountFlagBits samples;
7258         VK_IMAGE_TILING_OPTIMAL,             // VkImageTiling tiling;
7259         VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
7260         VK_SHARING_MODE_EXCLUSIVE,           // VkSharingMode sharingMode;
7261         0u,                                  // uint32_t queueFamilyIndexCount;
7262         0u,                                  // const uint32_t* pQueueFamilyIndices;
7263         VK_IMAGE_LAYOUT_UNDEFINED            // VkImageLayout initialLayout;
7264     };
7265     const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
7266     de::MovePtr<ImageWithMemory> image(
7267         new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
7268     Move<VkImageView> view        = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
7269     Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
7270     Move<VkFramebuffer> framebuffer =
7271         makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
7272     const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
7273     const Shaders shaders                       = createShaders();
7274     Move<VkPipelineLayout> pipelineLayout       = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
7275     Move<VkPipeline> pipeline =
7276         createGraphicsPipeline(*pipelineLayout, *renderPass, imageWidth, imageHeight, shaders,
7277                                VK_PRIMITIVE_TOPOLOGY_PATCH_LIST, TessEvalRandomProgram::quadInvocationCount);
7278     Move<VkCommandPool> cmdPool =
7279         createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
7280     Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
7281 
7282     PushConstant pc{};
7283     pc.invocationStride = 0u;
7284     pc.width            = TessEvalRandomProgram::quadInvocationCount;
7285 
7286     auto callRecordDrawingAndSubmit = std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this,
7287                                                 *cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, std::cref(pc),
7288                                                 std::cref(renderBeginInfo), **vertexBuffer, vertexCount, **image);
7289 
7290     // compute "maxLoc", which is a potential maximum number of locations written
7291     callRecordDrawingAndSubmit();
7292 
7293     // Take the maximum of "maxLoc" over all invocations.
7294     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7295     auto rangeLoc                       = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], invocationStride);
7296     const uint32_t computedShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7297     log << tcu::TestLog::Message << "computed shaderMaxLoc: " << computedShaderMaxLoc << tcu::TestLog::EndMessage;
7298 
7299     // If we need more space, reallocate OutputB::b[] aka buffers[1]
7300     if (computedShaderMaxLoc > simulationMaxLoc)
7301     {
7302         // Add one (to make sure no additional writes are done) and multiply by
7303         // the number of invocations and current primitive count
7304         maxLoc               = (computedShaderMaxLoc + 1) * invocationStride;
7305         sizes[OutputBallots] = maxLoc * sizeof(uint64_t);
7306 
7307         if (sizes[OutputBallots] > limits.maxStorageBufferRange)
7308             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
7309 
7310         try
7311         {
7312             buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
7313                 vk, device, allocator,
7314                 makeBufferCreateInfo(sizes[1], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
7315                                                    VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
7316                 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
7317         }
7318         catch (tcu::ResourceError &)
7319         {
7320             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7321             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7322                                    "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
7323         }
7324         bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
7325         ptrs[OutputBallots]              = buffers[OutputBallots]->getAllocation().getHostPtr();
7326 
7327         vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
7328         setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
7329                                       VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
7330         setUpdateBuilder2.update(vk, device);
7331     }
7332 
7333     // Clear any writes to ballots/stores OutputB::b[] aka buffer[1] during the counting pass
7334     // Note that its size would may change since the first memory allocation
7335     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7336     // Clear any writes to counting OutputC::loc[] aka buffer[2] during the counting pass
7337     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7338 
7339     // flush them all to the GPU
7340     flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7341     flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7342 
7343     // run the actual shader with updated PushConstant
7344     pc.invocationStride = invocationStride;
7345     pc.width            = TessEvalRandomProgram::quadInvocationCount;
7346     callRecordDrawingAndSubmit();
7347 
7348     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7349     const uint32_t finalShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7350     log << tcu::TestLog::Message << "final shaderMaxLoc: " << finalShaderMaxLoc << tcu::TestLog::EndMessage;
7351     if (finalShaderMaxLoc > computedShaderMaxLoc)
7352     {
7353         std::stringstream s;
7354         s << "maxLoc differs across shader invocations: " << finalShaderMaxLoc << " and " << computedShaderMaxLoc;
7355         return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING, s.str());
7356     }
7357 
7358     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7359     invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7360 
7361     // Simulate execution on the CPU, and compare against the GPU result
7362     try
7363     {
7364         ref.resize(maxLoc, 0ull);
7365     }
7366     catch (const std::bad_alloc &)
7367     {
7368         // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7369         return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
7370                                "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
7371     }
7372 
7373     program->simulate(false, m_subgroupSize, ref);
7374 
7375     const uint64_t *ballots = static_cast<uint64_t *>(ptrs[OutputBallots]);
7376     qpTestResult res        = calculateAndLogResult(ballots, ref, invocationStride, m_subgroupSize, finalShaderMaxLoc,
7377                                                     (invocationStride / 3), PrintMode::None);
7378 
7379     return tcu::TestStatus(res, qpGetTestResultName(res));
7380 }
7381 
createVertexBufferAndFlush(uint32_t cellsHorz,uint32_t cellsVert,VkPrimitiveTopology topology)7382 de::MovePtr<BufferWithMemory> ReconvergenceTestGeometryInstance::createVertexBufferAndFlush(
7383     uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
7384 {
7385     DE_UNREF(topology);
7386     DE_ASSERT(VK_PRIMITIVE_TOPOLOGY_POINT_LIST == topology);
7387     const std::vector<tcu::Vec4> vertices = GeometryRandomProgram::Arrangement::generatePrimitives(
7388         cellsHorz, cellsVert, GeometryRandomProgram::fillPercentage);
7389     return ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(vertices);
7390 }
7391 
createShaders(void)7392 std::vector<Move<VkShaderModule>> ReconvergenceTestGeometryInstance::createShaders(void)
7393 {
7394     const DeviceInterface &vk = m_context.getDeviceInterface();
7395     const VkDevice device     = m_context.getDevice();
7396 
7397     Move<VkShaderModule> vertex   = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"));
7398     Move<VkShaderModule> fragment = createShaderModule(vk, device, m_context.getBinaryCollection().get("frag"));
7399     Move<VkShaderModule> geometry = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"));
7400 
7401     // { #vert, #frag, tesc, tese, #geom }; if any
7402     std::vector<Move<VkShaderModule>> shaders;
7403     shaders.emplace_back(vertex);
7404     shaders.emplace_back(fragment);
7405     shaders.emplace_back();
7406     shaders.emplace_back();
7407     shaders.emplace_back(geometry);
7408 
7409     return shaders;
7410 }
7411 
iterate(void)7412 tcu::TestStatus ReconvergenceTestGeometryInstance::iterate(void)
7413 {
7414     const VkPhysicalDeviceLimits &limits = m_context.getDeviceProperties().limits;
7415     if (sizeof(PushConstant) > limits.maxPushConstantsSize)
7416     {
7417         return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7418                                "PushConstant size " + std::to_string(sizeof(PushConstant)) + " exceeds device limit " +
7419                                    std::to_string(limits.maxPushConstantsSize));
7420     }
7421 
7422     const DeviceInterface &vk          = m_context.getDeviceInterface();
7423     const VkDevice device              = m_context.getDevice();
7424     Allocator &allocator               = m_context.getDefaultAllocator();
7425     const uint32_t queueIndex          = m_context.getUniversalQueueFamilyIndex();
7426     add_ref<tcu::TestLog> log          = m_context.getTestContext().getLog();
7427     const VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
7428     const uint32_t fragmentStride      = uint32_t(m_data.sizeX * m_data.sizeY);
7429     const uint32_t invocationStride    = GeometryRandomProgram::Arrangement::calculatePrimitiveCount(
7430         m_data.sizeX, m_data.sizeY, GeometryRandomProgram::fillPercentage);
7431 
7432     de::MovePtr<GeometryRandomProgram> program(new GeometryRandomProgram(m_data));
7433     program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
7434 
7435     // simulate content of outputP buffer
7436     std::vector<uint32_t> outputP =
7437         GeometryRandomProgram::Arrangement::generateVectorOutputP(m_subgroupSize, invocationStride);
7438 
7439     std::vector<tcu::UVec4> ref;
7440     const uint32_t hostMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), true, m_subgroupSize,
7441                                                  fragmentStride, invocationStride, ref, log, outputP, nullptr);
7442     log << tcu::TestLog::Message << "Rendering area  : " << tcu::UVec2(m_data.sizeX, m_data.sizeY)
7443         << tcu::TestLog::EndMessage;
7444     log << tcu::TestLog::Message << "invocationStride: " << invocationStride << tcu::TestLog::EndMessage;
7445     log << tcu::TestLog::Message << "Simulated maxLoc: " << hostMaxLoc << tcu::TestLog::EndMessage;
7446     // maxLoc is per-invocation. Add one (to make sure no additional writes are done).
7447     uint32_t maxLoc = hostMaxLoc;
7448     maxLoc += 1;
7449     maxLoc *= invocationStride;
7450 
7451     constexpr uint32_t bufferCount = 4u;
7452     enum Bindings
7453     {
7454         InputA,
7455         OutputBallots,
7456         OutputCounts,
7457         OutputPrimitives
7458     };
7459 
7460     de::MovePtr<BufferWithMemory> buffers[bufferCount];
7461     vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
7462 
7463     uint32_t counts[bufferCount]{// InputA  { uint    a[]; } inputA;
7464                                  uint32_t(m_data.sizeX * m_data.sizeY),
7465                                  // OutputB { uvec2   b[]; } outputB;
7466                                  maxLoc,
7467                                  // OutputC { uint loc[]; } outputC;
7468                                  invocationStride,
7469                                  // OutputP { uint p[]; } outputP;
7470                                  uint32_t(outputP.size())};
7471 
7472     VkDeviceSize sizes[bufferCount]{// InputA  { uint    a[]; } inputA;
7473                                     counts[InputA] * sizeof(uint32_t),
7474                                     // OutputB { uvec2   b[]; } outputB;
7475                                     counts[OutputBallots] * sizeof(tcu::UVec4),
7476                                     // OutputC { uint loc[]; } outputC;
7477                                     counts[OutputCounts] * sizeof(uint32_t),
7478                                     // OutputP { uint p[]; } outputP;
7479                                     counts[OutputPrimitives] * sizeof(uint32_t)};
7480 
7481     const VkBufferUsageFlags cmnUsages = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
7482     VkBufferUsageFlags usages[bufferCount]{
7483         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7484         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7485         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7486         VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7487     };
7488 
7489     // allocate buffers
7490     for (uint32_t i = 0; i < bufferCount; ++i)
7491     {
7492         if (sizes[i] > limits.maxStorageBufferRange)
7493             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
7494         try
7495         {
7496             buffers[i] = de::MovePtr<BufferWithMemory>(
7497                 new BufferWithMemory(vk, device, allocator, makeBufferCreateInfo(sizes[i], usages[i] | cmnUsages),
7498                                      MemoryRequirement::HostVisible | MemoryRequirement::Cached));
7499         }
7500         catch (tcu::ResourceError &)
7501         {
7502             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7503             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7504                                    "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
7505         }
7506         bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
7507     }
7508 
7509     // get raw pointers to previously allocated buffers
7510     void *ptrs[bufferCount];
7511     for (uint32_t i = 0; i < bufferCount; ++i)
7512     {
7513         ptrs[i] = (uint32_t *)buffers[i]->getAllocation().getHostPtr();
7514     }
7515 
7516     // populate buffers with their destination
7517     {
7518         auto rangeBufferA = makeStdBeginEnd<uint32_t>(ptrs[InputA], counts[InputA]);
7519         std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
7520     }
7521     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7522     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7523     deMemset(ptrs[OutputPrimitives], 0, (size_t)sizes[OutputPrimitives]);
7524 
7525     // (...) and flush them to the GPU
7526     for (uint32_t i = 0; i < bufferCount; ++i)
7527     {
7528         flushAlloc(vk, device, buffers[i]->getAllocation());
7529     }
7530 
7531     VkDescriptorType descTypes[bufferCount]{
7532         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7533         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7534         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7535         VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7536     };
7537 
7538     vk::DescriptorSetLayoutBuilder layoutBuilder;
7539     for (uint32_t i = 0; i < bufferCount; ++i)
7540     {
7541         layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
7542     }
7543     vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
7544 
7545     vk::DescriptorPoolBuilder poolBuilder;
7546     for (uint32_t i = 0; i < bufferCount; ++i)
7547     {
7548         poolBuilder.addType(descTypes[i], 1);
7549     }
7550     vk::Unique<vk::VkDescriptorPool> descriptorPool(
7551         poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
7552     vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
7553 
7554     vk::DescriptorSetUpdateBuilder setUpdateBuilder;
7555     for (uint32_t i = 0; i < bufferCount; ++i)
7556     {
7557         setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
7558                                      &bufferDescriptors[i]);
7559     }
7560     setUpdateBuilder.update(vk, device);
7561 
7562     const VkPushConstantRange pushConstantRange{
7563         (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
7564         0u,                                     // uint32_t offset;
7565         sizeof(PushConstant)                    // uint32_t size;
7566     };
7567 
7568     const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
7569         VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
7570         DE_NULL,                                       // pNext
7571         (VkPipelineLayoutCreateFlags)0,                // flags
7572         1u,                                            // setLayoutCount
7573         &descriptorSetLayout.get(),                    // pSetLayouts
7574         1u,                                            // pushConstantRangeCount
7575         &pushConstantRange,                            // pPushConstantRanges
7576     };
7577 
7578     const uint32_t imageWidth  = m_data.sizeX;
7579     const uint32_t imageHeight = m_data.sizeY;
7580     const VkFormat format      = VK_FORMAT_R8G8B8A8_UNORM;
7581     const VkImageCreateInfo imageCreateInfo{
7582         VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
7583         nullptr,                             // const void* pNext;
7584         VkImageCreateFlags(0),               // VkImageCreateFlags flags;
7585         VK_IMAGE_TYPE_2D,                    // VkImageType imageType;
7586         format,                              // VkFormat format;
7587         {imageWidth, imageHeight, 1u},       // VkExtent3D extent;
7588         1u,                                  // uint32_t mipLevels;
7589         1u,                                  // uint32_t arrayLayers;
7590         VK_SAMPLE_COUNT_1_BIT,               // VkSampleCountFlagBits samples;
7591         VK_IMAGE_TILING_OPTIMAL,             // VkImageTiling tiling;
7592         VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
7593         VK_SHARING_MODE_EXCLUSIVE,           // VkSharingMode sharingMode;
7594         0u,                                  // uint32_t queueFamilyIndexCount;
7595         0u,                                  // const uint32_t* pQueueFamilyIndices;
7596         VK_IMAGE_LAYOUT_UNDEFINED            // VkImageLayout initialLayout;
7597     };
7598     const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
7599     de::MovePtr<ImageWithMemory> image(
7600         new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
7601     Move<VkImageView> view        = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
7602     Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
7603     Move<VkFramebuffer> framebuffer =
7604         makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
7605     de::MovePtr<BufferWithMemory> vertexBuffer  = createVertexBufferAndFlush(m_data.sizeX, m_data.sizeY, topology);
7606     const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
7607     const Shaders shaders                       = createShaders();
7608     Move<VkPipelineLayout> pipelineLayout       = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
7609     Move<VkPipeline> pipeline = createGraphicsPipeline(*pipelineLayout, *renderPass, imageWidth, imageHeight, shaders,
7610                                                        VK_PRIMITIVE_TOPOLOGY_POINT_LIST);
7611     Move<VkCommandPool> cmdPool =
7612         createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
7613     Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
7614 
7615     PushConstant pc{};
7616     pc.invocationStride      = invocationStride;
7617     pc.width                 = m_data.sizeX;
7618     pc.height                = m_data.sizeY;
7619     pc.enableInvocationIndex = VK_FALSE;
7620 
7621     auto callRecordDrawingAndSubmit = std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this,
7622                                                 *cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, std::cref(pc),
7623                                                 std::cref(renderBeginInfo), **vertexBuffer, invocationStride, **image);
7624 
7625     // compute "maxLoc", which is a potential maximum number of locations written
7626     callRecordDrawingAndSubmit();
7627 
7628     // Take the maximum of "maxLoc" over all invocations.
7629     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7630     auto rangeLoc               = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], invocationStride);
7631     const uint32_t shaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7632     log << tcu::TestLog::Message << "computed maxLoc in shader: " << shaderMaxLoc << tcu::TestLog::EndMessage;
7633 
7634     // If we need more space, reallocate OutputB::b[] aka buffers[1]
7635     if (shaderMaxLoc > hostMaxLoc)
7636     {
7637         // Add one (to make sure no additional writes are done) and multiply by
7638         // the number of invocations and current primitive count
7639         maxLoc                = (std::max(shaderMaxLoc, hostMaxLoc) + 1u) * invocationStride;
7640         counts[OutputBallots] = maxLoc;
7641         sizes[OutputBallots]  = counts[OutputBallots] * sizeof(tcu::UVec4);
7642 
7643         if (sizes[OutputBallots] > limits.maxStorageBufferRange)
7644             TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
7645 
7646         try
7647         {
7648             buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
7649                 vk, device, allocator, makeBufferCreateInfo(sizes[1], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | cmnUsages),
7650                 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
7651         }
7652         catch (tcu::ResourceError &)
7653         {
7654             // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7655             return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7656                                    "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
7657         }
7658         bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
7659         ptrs[OutputBallots]              = buffers[OutputBallots]->getAllocation().getHostPtr();
7660 
7661         vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
7662         setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
7663                                       VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
7664         setUpdateBuilder2.update(vk, device);
7665     }
7666 
7667     // Clear any writes to ballots/stores OutputB::b[] aka buffer[1] during the counting pass
7668     // Note that its size would may change since the first memory allocation
7669     deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7670     deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7671     deMemset(ptrs[OutputPrimitives], 0, (size_t)sizes[OutputPrimitives]);
7672 
7673     // flush them all to the GPU
7674     flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7675     flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7676     flushAlloc(vk, device, buffers[OutputPrimitives]->getAllocation());
7677 
7678     // run the actual shader with updated PushConstant
7679     pc.enableInvocationIndex = VK_TRUE;
7680     callRecordDrawingAndSubmit();
7681 
7682     invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7683     const uint32_t finalShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7684     log << tcu::TestLog::Message << "final shaderMaxLoc: " << finalShaderMaxLoc << tcu::TestLog::EndMessage;
7685     if (finalShaderMaxLoc != shaderMaxLoc)
7686     {
7687         return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7688                                "maxLoc differs across shader invocations, expected: " + de::toString(shaderMaxLoc) +
7689                                    " got: " + de::toString(finalShaderMaxLoc));
7690     }
7691 
7692     invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7693     const tcu::UVec4 *ballots = static_cast<tcu::UVec4 *>(ptrs[OutputBallots]);
7694 
7695     invalidateAlloc(vk, device, buffers[OutputPrimitives]->getAllocation());
7696     auto outputPrange = makeStdBeginEnd<uint32_t>(ptrs[OutputPrimitives], counts[OutputPrimitives]);
7697     std::copy(outputPrange.first, outputPrange.second, outputP.begin());
7698 
7699     try
7700     {
7701         ref.resize(counts[OutputBallots], tcu::UVec4(0u, 0u, 0u, 0u));
7702     }
7703     catch (const std::bad_alloc &)
7704     {
7705         // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7706         return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
7707                                "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
7708     }
7709 
7710     // Simulate execution on the CPU, and compare against the GPU result
7711     const uint32_t finalHostMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), false, m_subgroupSize,
7712                                                       fragmentStride, invocationStride, ref, log, outputP, ballots);
7713 
7714     const qpTestResult res = calculateAndLogResultEx(log, ballots, ref, finalHostMaxLoc, PrintMode::None);
7715 
7716     return tcu::TestStatus(res, qpGetTestResultName(res));
7717 }
7718 
calculateAndLogResultEx(add_ref<tcu::TestLog> log,const tcu::UVec4 * result,const std::vector<tcu::UVec4> & ref,const uint32_t maxLoc,const PrintMode printMode)7719 qpTestResult_e ReconvergenceTestGeometryInstance::calculateAndLogResultEx(add_ref<tcu::TestLog> log,
7720                                                                           const tcu::UVec4 *result,
7721                                                                           const std::vector<tcu::UVec4> &ref,
7722                                                                           const uint32_t maxLoc,
7723                                                                           const PrintMode printMode)
7724 {
7725     DE_UNREF(maxLoc);
7726     DE_UNREF(printMode);
7727 
7728     qpTestResult res                  = QP_TEST_RESULT_PASS;
7729     uint32_t mismatchCount            = 0u;
7730     const uint32_t printMismatchCount = 5u;
7731 
7732     // With maximal reconvergence, we should expect the output to exactly match the reference.
7733     const uint32_t ballotStoreCount = static_cast<uint32_t>(ref.size());
7734     for (uint32_t i = 0; i < ballotStoreCount; ++i)
7735     {
7736         const Ballot resultVal(result[i], m_subgroupSize);
7737         const Ballot refVal(ref.at(i), m_subgroupSize);
7738         if (resultVal != refVal)
7739         {
7740             if (mismatchCount++ < printMismatchCount)
7741             {
7742                 res = QP_TEST_RESULT_FAIL;
7743                 log << tcu::TestLog::Message << "Mismatch at " << i << "\nexpected: " << resultVal
7744                     << "\n     got: " << refVal << tcu::TestLog::EndMessage;
7745                 if (printMode == PrintMode::Console)
7746                 {
7747                     std::cout << "Mismatch at " << i << "\nexpected: " << resultVal << "\n     got: " << refVal
7748                               << std::endl;
7749                 }
7750             }
7751         }
7752     }
7753 
7754     log << tcu::TestLog::Message << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount
7755         << tcu::TestLog::EndMessage;
7756     if (printMode == PrintMode::Console)
7757     {
7758         std::cout << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount << std::endl;
7759     }
7760 
7761     return res;
7762 }
7763 
7764 void createAmberFragmentTestCases(add_ref<tcu::TestContext> testCtx, add_ptr<tcu::TestCaseGroup> group);
7765 
createTests(tcu::TestContext & testCtx,const std::string & name,bool createExperimental)7766 tcu::TestCaseGroup *createTests(tcu::TestContext &testCtx, const std::string &name, bool createExperimental)
7767 {
7768     de::MovePtr<tcu::TestCaseGroup> group(new tcu::TestCaseGroup(testCtx, name.c_str(), "reconvergence tests"));
7769 
7770     typedef struct
7771     {
7772         uint32_t value;
7773         const char *name;
7774         const char *description;
7775     } TestGroupCase;
7776 
7777     TestGroupCase ttCases[] = {
7778         {TT_SUCF_ELECT, "subgroup_uniform_control_flow_elect", "subgroup_uniform_control_flow_elect"},
7779         {TT_SUCF_BALLOT, "subgroup_uniform_control_flow_ballot", "subgroup_uniform_control_flow_ballot"},
7780         {TT_WUCF_ELECT, "workgroup_uniform_control_flow_elect", "workgroup_uniform_control_flow_elect"},
7781         {TT_WUCF_BALLOT, "workgroup_uniform_control_flow_ballot", "workgroup_uniform_control_flow_ballot"},
7782         {TT_MAXIMAL, "maximal", "maximal"},
7783     };
7784 
7785     std::pair<VkShaderStageFlagBits, const char *> const stTypes[]{
7786         {VK_SHADER_STAGE_COMPUTE_BIT, "compute"},
7787         {VK_SHADER_STAGE_FRAGMENT_BIT, "fragment"},
7788 #ifdef INCLUDE_GRAPHICS_TESTS
7789         {VK_SHADER_STAGE_VERTEX_BIT, "vertex"},
7790         {VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT, "tessctrl"},
7791         {VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, "tesseval"},
7792         {VK_SHADER_STAGE_GEOMETRY_BIT, "geometry"},
7793 #endif
7794     };
7795 
7796     for (int ttNdx = 0; ttNdx < DE_LENGTH_OF_ARRAY(ttCases); ttNdx++)
7797     {
7798         de::MovePtr<tcu::TestCaseGroup> ttGroup(
7799             new tcu::TestCaseGroup(testCtx, ttCases[ttNdx].name, ttCases[ttNdx].description));
7800 
7801         for (int stNdx = 0; stNdx < DE_LENGTH_OF_ARRAY(stTypes); ++stNdx)
7802         {
7803             // Only 'maximal' tests can process this loop when we are dealing with various kind of shaders,
7804             if (stTypes[stNdx].first != VK_SHADER_STAGE_COMPUTE_BIT && ttCases[ttNdx].value != TT_MAXIMAL)
7805                 continue;
7806 
7807             de::MovePtr<tcu::TestCaseGroup> shaderGroup(new tcu::TestCaseGroup(testCtx, stTypes[stNdx].second, ""));
7808 
7809             uint32_t nNdx = 2;
7810 
7811             if (stTypes[stNdx].first == VK_SHADER_STAGE_FRAGMENT_BIT)
7812             {
7813                 nNdx = 7;
7814                 createAmberFragmentTestCases(testCtx, shaderGroup.get());
7815             }
7816 
7817             for (/*uint32_t nNdx = 2*/; nNdx <= 6; nNdx++)
7818             {
7819                 de::MovePtr<tcu::TestCaseGroup> nestGroup(
7820                     new tcu::TestCaseGroup(testCtx, ("nesting" + de::toString(nNdx)).c_str(), ""));
7821 
7822                 uint32_t seed = 0;
7823 
7824                 for (int sNdx = 0; sNdx < 8; sNdx++)
7825                 {
7826                     de::MovePtr<tcu::TestCaseGroup> seedGroup(
7827                         new tcu::TestCaseGroup(testCtx, de::toString(sNdx).c_str(), ""));
7828 
7829                     uint32_t numTests = 0;
7830                     switch (nNdx)
7831                     {
7832                     default:
7833                         DE_ASSERT(0);
7834                         // fallthrough
7835                     case 2:
7836                     case 3:
7837                     case 4:
7838                         numTests = 250;
7839                         break;
7840                     case 5:
7841                         numTests = 100;
7842                         break;
7843                     case 6:
7844                         numTests = 50;
7845                         break;
7846                     }
7847 
7848                     if (ttCases[ttNdx].value != TT_MAXIMAL)
7849                     {
7850                         if (nNdx >= 5)
7851                             continue;
7852                     }
7853 
7854                     for (uint32_t ndx = 0; ndx < numTests; ndx++)
7855                     {
7856                         uint32_t dim = 0u;
7857                         DE_UNREF(dim);
7858                         uint32_t sizeX = 0u;
7859                         uint32_t sizeY = 0u;
7860                         switch (stTypes[stNdx].first)
7861                         {
7862                         case VK_SHADER_STAGE_COMPUTE_BIT:
7863                             // we want to test at least full subgroup
7864                             // both are primary numbers
7865                             sizeX = 7u;
7866                             sizeY = 13u;
7867                             break;
7868                         case VK_SHADER_STAGE_FRAGMENT_BIT:
7869                             sizeX = 32;
7870                             sizeY = 32;
7871                             break;
7872                         case VK_SHADER_STAGE_VERTEX_BIT:
7873                             // we want to test at least full subgroup
7874                             dim   = uint32_t(std::ceil(
7875                                 std::sqrt((double)(((128u + 31u) * 100u) / VertexRandomProgram::fillPercentage))));
7876                             sizeX = dim;
7877                             sizeY = dim;
7878                             break;
7879                         case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
7880                             sizeX = 19; // positive number of desired subgroups
7881                             sizeY = 1;  // used only for framebuffer extent in TCS test
7882                             break;
7883                         case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
7884                             sizeX = 23; // positive number of desired subgroups
7885                             sizeY = 1;  // used only for framebuffer extent in TES test
7886                             break;
7887                         case VK_SHADER_STAGE_GEOMETRY_BIT:
7888                             // we want to test at least full subgroup
7889                             dim   = uint32_t(std::ceil(
7890                                 std::sqrt((double)(((128u + 29u) * 100u) / GeometryRandomProgram::fillPercentage))));
7891                             sizeX = dim;
7892                             sizeY = dim;
7893                             break;
7894                         default:
7895                             DE_ASSERT(0);
7896                         }
7897                         CaseDef c = {
7898                             stTypes[stNdx].first,           // VkShaderStageFlagBits    shaderStage
7899                             (TestType)ttCases[ttNdx].value, // TestType testType;
7900                             nNdx,                           // uint32_t maxNesting;
7901                             seed,                           // uint32_t seed;
7902                             sizeX,                          // uint32_t sizeX;
7903                             sizeY                           // uint32_t sizeY;
7904                         };
7905                         // product of sizeX and sizeY must not exceed MAX_INVOCATIONS_ALL_TESTS
7906                         DE_ASSERT(c.verify());
7907                         seed++;
7908 
7909                         bool isExperimentalTest = (ndx >= numTests / 5);
7910 
7911                         if (createExperimental == isExperimentalTest)
7912                             seedGroup->addChild(new ReconvergenceTestCase(testCtx, de::toString(ndx).c_str(), c));
7913                     }
7914                     if (!seedGroup->empty())
7915                         nestGroup->addChild(seedGroup.release());
7916                 }
7917                 if (!nestGroup->empty())
7918                     shaderGroup->addChild(nestGroup.release());
7919             }
7920             if (!shaderGroup->empty())
7921                 ttGroup->addChild(shaderGroup.release());
7922         }
7923         group->addChild(ttGroup.release());
7924     }
7925 
7926     return group.release();
7927 }
7928 
createAmberFragmentTestCases(add_ref<tcu::TestContext> testCtx,add_ptr<tcu::TestCaseGroup> group)7929 void createAmberFragmentTestCases(add_ref<tcu::TestContext> testCtx, add_ptr<tcu::TestCaseGroup> group)
7930 {
7931     using namespace cts_amber;
7932 
7933     enum Tests
7934     {
7935         TERMINATE_INVOCATION,
7936         DEMOTE_INVOCATION,
7937         DEMOTE_ENTIRE_QUAD,
7938         DEMOTE_HALF_QUAD_TOP,
7939         DEMOTE_HALF_QUAD_RIGHT,
7940         DEMOTE_HALF_QUAD_BOTTOM,
7941         DEMOTE_HALF_QUAD_LEFT,
7942         DEMOTE_HALF_QUAD_SLASH,
7943         DEMOTE_HALF_QUAD_BACKSLASH
7944     };
7945 
7946     struct Case
7947     {
7948         Tests test;
7949         add_cptr<char> name;
7950         add_cptr<char> desc;
7951         std::size_t hname;
7952         Case(Tests aTest, add_cptr<char> aName, add_cptr<char> aDesc)
7953             : test(aTest)
7954             , name(aName)
7955             , desc(aDesc)
7956             , hname(std::hash<std::string>()(std::string(aName)))
7957         {
7958         }
7959         bool matches(add_cref<std::string> aName) const
7960         {
7961             return hname == std::hash<std::string>()(aName);
7962         }
7963         static bool matches(add_cref<std::string> aName, std::initializer_list<Case> aList)
7964         {
7965             for (auto i = aList.begin(); i != aList.end(); ++i)
7966             {
7967                 if (i->matches(aName))
7968                     return true;
7969             }
7970             return false;
7971         }
7972         std::string makeFileName() const
7973         {
7974             return (std::string(name) + ".amber");
7975         }
7976     } static const cases[]{
7977         Case(TERMINATE_INVOCATION, "terminate_invocation",
7978              "Verifies that terminated invocation is no longer included in the ballot"),
7979         Case(DEMOTE_INVOCATION, "demote_invocation",
7980              "Verifies that the demoted invocation is not present in the ballot"),
7981         Case(DEMOTE_ENTIRE_QUAD, "demote_entire_quad", "Verifies that the demoted quad is not present in the ballot"),
7982         Case(DEMOTE_HALF_QUAD_TOP, "demote_half_quad_top",
7983              "Verifies that the demoted part of the quad is not present in the ballot"),
7984         Case(DEMOTE_HALF_QUAD_RIGHT, "demote_half_quad_right",
7985              "Verifies that the demoted part of the quad is not present in the ballot"),
7986         Case(DEMOTE_HALF_QUAD_BOTTOM, "demote_half_quad_bottom",
7987              "Verifies that the demoted part of the quad is not present in the ballot"),
7988         Case(DEMOTE_HALF_QUAD_LEFT, "demote_half_quad_left",
7989              "Verifies that the demoted part of the quad is not present in the ballot"),
7990         Case(DEMOTE_HALF_QUAD_SLASH, "demote_half_quad_slash",
7991              "Verifies that the demoted part of the quad is not present in the ballot"),
7992         Case(DEMOTE_HALF_QUAD_BACKSLASH, "demote_half_quad_backslash",
7993              "Verifies that the demoted part of the quad is not present in the ballot"),
7994     };
7995 
7996     auto testSupports = [](Context &context, std::string testName) -> void
7997     {
7998         if (!(context.getSubgroupProperties().supportedStages & VK_SHADER_STAGE_FRAGMENT_BIT))
7999             TCU_THROW(NotSupportedError, "Subgroup operations not supported in fragment stage");
8000 
8001         if (!context.getShaderMaximalReconvergenceFeatures().shaderMaximalReconvergence)
8002             TCU_THROW(NotSupportedError, "shaderMaximalReconvergence not supported");
8003 
8004         if (!(context.getSubgroupProperties().supportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT))
8005             TCU_THROW(NotSupportedError, "VK_SUBGROUP_FEATURE_BALLOT_BIT not supported");
8006 
8007         if (Case::matches(testName, {cases[DEMOTE_ENTIRE_QUAD]}))
8008         {
8009             if (!(context.getSubgroupProperties().subgroupSize > 4))
8010                 TCU_THROW(NotSupportedError, "subgroupSize is less than or equal to 4");
8011         }
8012         else
8013         {
8014             if (!(context.getSubgroupProperties().subgroupSize >= 4))
8015                 TCU_THROW(NotSupportedError, "subgroupSize is less than 4");
8016         }
8017 
8018         if (Case::matches(testName, {cases[TERMINATE_INVOCATION]}))
8019         {
8020             if (!context.getShaderTerminateInvocationFeatures().shaderTerminateInvocation)
8021                 TCU_THROW(NotSupportedError, "shaderTerminateInvocation not supported.");
8022         }
8023         else
8024         {
8025 #ifndef CTS_USES_VULKANSC
8026             if (!context.getShaderDemoteToHelperInvocationFeatures().shaderDemoteToHelperInvocation)
8027                 TCU_THROW(NotSupportedError, "demoteToHelperInvocation not supported.");
8028 #else
8029             if (!context.getShaderDemoteToHelperInvocationFeaturesEXT().shaderDemoteToHelperInvocation)
8030                 TCU_THROW(NotSupportedError, "demoteToHelperInvocation not supported.");
8031 #endif
8032         }
8033     };
8034 
8035     auto updateTest = [&](add_ptr<AmberTestCase> theTest) -> add_ptr<AmberTestCase>
8036     {
8037         theTest->setCheckSupportCallback(testSupports);
8038         return theTest;
8039     };
8040 
8041     const std::string testsFolder(std::string("reconvergence/maximal/") + group->getName());
8042 
8043     for (add_cref<Case> aCase : cases)
8044     {
8045         group->addChild(updateTest(
8046             createAmberTestCase(testCtx, aCase.name, aCase.desc, testsFolder.c_str(), aCase.makeFileName())));
8047     }
8048 }
8049 
8050 } // namespace
8051 
createTests(tcu::TestContext & testCtx,const std::string & name)8052 tcu::TestCaseGroup *createTests(tcu::TestContext &testCtx, const std::string &name)
8053 {
8054     return createTests(testCtx, name, false);
8055 }
8056 
createTestsExperimental(tcu::TestContext & testCtx,const std::string & name)8057 tcu::TestCaseGroup *createTestsExperimental(tcu::TestContext &testCtx, const std::string &name)
8058 {
8059     return createTests(testCtx, name, true);
8060 }
8061 
8062 } // namespace Reconvergence
8063 } // namespace vkt
8064