1 /*------------------------------------------------------------------------
2 * Vulkan Conformance Tests
3 * ------------------------
4 *
5 * Copyright (c) 2019 The Khronos Group Inc.
6 * Copyright (c) 2018-2020 NVIDIA Corporation
7 *
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 *
20 * \file
21 * \brief Vulkan Reconvergence tests
22 *//*--------------------------------------------------------------------*/
23
24 #include "vktReconvergenceTests.hpp"
25
26 #include "vkBufferWithMemory.hpp"
27 #include "vkImageWithMemory.hpp"
28 #include "vkQueryUtil.hpp"
29 #include "vkBuilderUtil.hpp"
30 #include "vkCmdUtil.hpp"
31 #include "vkTypeUtil.hpp"
32 #include "vkObjUtil.hpp"
33
34 #include "vktTestGroupUtil.hpp"
35 #include "vktTestCase.hpp"
36 #include "vktAmberTestCase.hpp"
37
38 #include "deDefs.h"
39 #include "deFloat16.h"
40 #include "deMath.h"
41 #include "deRandom.h"
42 #include "deSharedPtr.hpp"
43 #include "deString.h"
44
45 #include "tcuTestCase.hpp"
46 #include "tcuTestLog.hpp"
47
48 #include <array>
49 #include <bitset>
50 #include <functional>
51 #include <map>
52 #include <numeric>
53 #include <random>
54 #include <string>
55 #include <sstream>
56 #include <set>
57 #include <type_traits>
58 #include <vector>
59 #include <memory>
60 #include <cmath>
61 #include <initializer_list>
62
63 #include <iostream>
64
65 // #define INCLUDE_GRAPHICS_TESTS
66
67 namespace vkt
68 {
69 namespace Reconvergence
70 {
71 namespace
72 {
73 using namespace vk;
74 using namespace std;
75
76 #define ARRAYSIZE(x) (sizeof(x) / sizeof(x[0]))
77 #define ROUNDUP(x__, multipler__) ((((x__) + ((multipler__)-1)) / (multipler__)) * (multipler__))
78 #define ROUNDDOWN(x__, multipler__) (((x__) / (multipler__)) * (multipler__))
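// For illustration: ROUNDUP(5, 4) evaluates to 8 and ROUNDDOWN(5, 4) to 4;
// both assume a positive multiplier and are used below for integer alignment.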
79 constexpr uint32_t MAX_INVOCATIONS_ALL_TESTS = 64 * 64;
80 typedef std::bitset<MAX_INVOCATIONS_ALL_TESTS> bitset_inv_t;
81 //constexpr bitset_inv_t MAGIC_BALLOT = 0x12345678;
82
83 typedef enum
84 {
85 TT_SUCF_ELECT, // subgroup_uniform_control_flow using elect (subgroup_basic)
86 TT_SUCF_BALLOT, // subgroup_uniform_control_flow using ballot (subgroup_ballot)
87 TT_WUCF_ELECT, // workgroup uniform control flow using elect (subgroup_basic)
88 TT_WUCF_BALLOT, // workgroup uniform control flow using ballot (subgroup_ballot)
89 TT_MAXIMAL, // maximal reconvergence
90 } TestType;
91
92 static_assert(VK_TRUE == 1, "VK_TRUE must equal 1");
93
94 struct CaseDef
95 {
96 VkShaderStageFlagBits shaderStage;
97 TestType testType;
98 uint32_t maxNesting;
99 uint32_t seed;
100 // In the case of a compute shader, the sizes below are local_size_x and local_size_y respectively.
101 // In the case of a fragment shader, these sizes define the framebuffer dimensions.
102 uint32_t sizeX;
103 uint32_t sizeY;
104
105 bool isWUCF() const
106 {
107 return testType == TT_WUCF_ELECT || testType == TT_WUCF_BALLOT;
108 }
109 bool isSUCF() const
110 {
111 return testType == TT_SUCF_ELECT || testType == TT_SUCF_BALLOT;
112 }
113 bool isUCF() const
114 {
115 return isWUCF() || isSUCF();
116 }
117 bool isElect() const
118 {
119 return testType == TT_WUCF_ELECT || testType == TT_SUCF_ELECT;
120 }
121
122 bool verify() const
123 {
124 return (sizeX * sizeY) <= MAX_INVOCATIONS_ALL_TESTS;
125 }
126 };
127
128 template <class T, class P = T (*)[1], class R = decltype(std::begin(*std::declval<P>()))>
129 static auto makeStdBeginEnd(void *p, uint32_t n) -> std::pair<R, R>
130 {
131 auto tmp = std::begin(*P(p));
132 auto begin = tmp;
133 std::advance(tmp, n);
134 return {begin, tmp};
135 }
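// Usage sketch (illustrative, names hypothetical): given a void* 'data' that points
// at 'count' uint32_t values in mapped buffer memory,
//   auto range = makeStdBeginEnd<uint32_t>(data, count);
//   std::fill(range.first, range.second, 0u);
// reinterprets the raw memory as uint32_t[] and yields a [begin, end) pointer pair.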
136
137 template <class R>
138 using add_ref = typename std::add_lvalue_reference<R>::type;
139 template <class R>
140 using add_cref = typename std::add_lvalue_reference<typename std::add_const<R>::type>::type;
141 template <class X>
142 using add_ptr = std::add_pointer_t<X>;
143 template <class X>
144 using add_cptr = std::add_pointer_t<std::add_const_t<X>>;
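// e.g. add_ref<int> is int&, add_cref<int> is const int&, add_ptr<int> is int*,
// and add_cptr<int> is const int*; shorthand used throughout this file.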
145
146 template <class RndIter>
147 RndIter max_element(RndIter first, RndIter last)
148 {
149 RndIter max = last;
150 if (first != last)
151 {
152 for (max = first, ++first; first != last; ++first)
153 {
154 if (*first > *max)
155 max = first;
156 }
157 }
158 return max;
159 }
160
161 template <class RndIter, class Selector>
162 RndIter max_element(RndIter first, RndIter last, Selector selector)
163 {
164 RndIter max = last;
165 if (first != last)
166 {
167 for (max = first, ++first; first != last; ++first)
168 {
169 if (selector(*first) > selector(*max))
170 max = first;
171 }
172 }
173 return max;
174 }
175
176 struct Ballot : public std::bitset<128>
177 {
178 typedef std::bitset<128> super;
179 Ballot() : super()
180 {
181 }
182 Ballot(add_cref<super> ballot, uint32_t printbits = 128u) : super(ballot), m_bits(printbits)
183 {
184 }
185 Ballot(add_cref<tcu::UVec4> ballot, uint32_t printbits = 128u) : super(), m_bits(printbits)
186 {
187 *this = ballot;
188 }
189 Ballot(uint64_t val, uint32_t printbits = 128u) : super(val), m_bits(printbits)
190 {
191 }
192 static Ballot withSetBit(uint32_t bit)
193 {
194 Ballot b;
195 b.set(bit);
196 return b;
197 }
198 constexpr uint32_t size() const
199 {
200 return static_cast<uint32_t>(super::size());
201 }
202 operator tcu::UVec4() const
203 {
204 tcu::UVec4 result;
205 super ballot(*this);
206 const super mask = 0xFFFFFFFF;
207 for (uint32_t k = 0; k < 4u; ++k)
208 {
209 result[k] = uint32_t((ballot & mask).to_ulong());
210 ballot >>= 32;
211 }
212 return result;
213 }
214 add_ref<Ballot> operator=(add_cref<tcu::UVec4> vec)
215 {
216 for (uint32_t k = 0; k < 4u; ++k)
217 {
218 (*this) <<= 32;
219 (*this) |= vec[3 - k];
220 }
221 return *this;
222 }
223 DE_UNUSED_FUNCTION uint32_t getw() const
224 {
225 return m_bits;
226 }
227 DE_UNUSED_FUNCTION void setw(uint32_t bits)
228 {
229 m_bits = bits;
230 }
231 DE_UNUSED_FUNCTION friend add_ref<std::ostream> operator<<(add_ref<std::ostream> str, add_cref<Ballot> ballot)
232 {
233 for (uint32_t i = 0u; i < ballot.m_bits && i < 128u; ++i)
234 {
235 str << (ballot[ballot.m_bits - i - 1u] ? '1' : '0');
236 }
237 return str;
238 }
239
240 protected:
241 uint32_t m_bits;
242 };
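// Note on the layout used by Ballot: bit i of the 128-bit set corresponds to
// gl_SubgroupInvocationID == i, and the conversion to/from tcu::UVec4 matches the
// GLSL uvec4 ballot layout (x holds bits 0..31, y bits 32..63, and so on).
// For example, Ballot(tcu::UVec4(0x3u, 0u, 0u, 0u)) has exactly bits 0 and 1 set.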
243
244 struct Ballots : protected std::vector<std::bitset<128>>
245 {
246 typedef std::vector<value_type> super;
247 static const constexpr uint32_t subgroupInvocationSize = static_cast<uint32_t>(value_type().size());
248 Ballots() : super()
249 {
250 }
251 explicit Ballots(uint32_t subgroupCount, add_cref<value_type> ballot = {}) : super(subgroupCount)
252 {
253 if (ballot.any())
254 *this = ballot;
255 }
256 Ballots(add_cref<Ballots> other) : super(upcast(other))
257 {
258 }
259 Ballots(Ballots &&other) : super(std::move(other))
260 {
261 }
262 using super::operator[];
263 using super::at;
264 /**
265 * @brief size method
266 * @return Returns the number of bits that the Ballots holds.
267 */
268 uint32_t size() const
269 {
270 return static_cast<uint32_t>(super::size() * subgroupInvocationSize);
271 }
272 /**
273 * @brief count method
274 * @return Returns the number of bits that are set to true.
275 */
276 uint32_t count() const
277 {
278 uint32_t n = 0u;
279 for (add_cref<value_type> b : *this)
280 n += static_cast<uint32_t>(b.count());
281 return n;
282 }
283 /**
284 * @brief count method
285 * @return Returns the number of bits that are set to true in given subgroup.
286 */
287 uint32_t count(uint32_t subgroup) const
288 {
289 DE_ASSERT(subgroup < subgroupCount());
290 return static_cast<uint32_t>(at(subgroup).count());
291 }
292 uint32_t subgroupCount() const
293 {
294 return static_cast<uint32_t>(super::size());
295 }
296 bool test(uint32_t bit) const
297 {
298 DE_ASSERT(bit < size());
299 return at(bit / subgroupInvocationSize).test(bit % subgroupInvocationSize);
300 }
301 bool set(uint32_t bit, bool value = true)
302 {
303 DE_ASSERT(bit < size());
304 const bool before = test(bit);
305 at(bit / subgroupInvocationSize).set((bit % subgroupInvocationSize), value);
306 return before;
307 }
308 void full()
309 {
310 const uint32_t bb = size();
311 for (uint32_t b = 0u; b < bb; ++b)
312 set(b);
313 }
314 add_ref<Ballots> setn(uint32_t bits)
315 {
316 for (uint32_t i = 0u; i < bits; ++i)
317 set(i);
318 return *this;
319 }
320 bool all() const
321 {
322 const uint32_t gg = subgroupCount();
323 for (uint32_t g = 0u; g < gg; ++g)
324 {
325 if (false == at(g).all())
326 return false;
327 }
328 return (gg != 0u);
329 }
330 bool none() const
331 {
332 const uint32_t gg = subgroupCount();
333 for (uint32_t g = 0u; g < gg; ++g)
334 {
335 if (false == at(g).none())
336 return false;
337 }
338 return (gg != 0u);
339 }
340 bool any() const
341 {
342 bool res = false;
343 const uint32_t gg = subgroupCount();
344 for (uint32_t g = 0u; g < gg; ++g)
345 res |= super::at(g).any();
346 return res;
347 }
348 static uint32_t findBit(uint32_t otherFullyQualifiedInvocationID, uint32_t otherSubgroupSize)
349 {
350 return (((otherFullyQualifiedInvocationID / otherSubgroupSize) * subgroupInvocationSize) +
351 (otherFullyQualifiedInvocationID % otherSubgroupSize));
352 }
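// Example: with otherSubgroupSize == 32, invocation 40 (subgroup 1, lane 8) maps
// to bit 1 * 128 + 8 == 136, i.e. lane 8 of the second 128-bit ballot.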
353 inline add_cref<super> upcast(add_cref<Ballots> other) const
354 {
355 return static_cast<add_cref<super>>(other);
356 }
357 add_ref<Ballots> operator&=(add_cref<Ballots> other)
358 {
359 DE_ASSERT(subgroupCount() == other.subgroupCount());
360 const uint32_t gg = subgroupCount();
361 for (uint32_t g = 0u; g < gg; ++g)
362 super::at(g) = super::at(g) & upcast(other).at(g);
363 return *this;
364 }
365 Ballots operator&(add_cref<Ballots> other) const
366 {
367 Ballots res(*this);
368 res &= other;
369 return res;
370 }
371 add_ref<Ballots> operator|=(add_cref<Ballots> other)
372 {
373 DE_ASSERT(subgroupCount() == other.subgroupCount());
374 const uint32_t gg = subgroupCount();
375 for (uint32_t g = 0u; g < gg; ++g)
376 super::at(g) = super::at(g) | upcast(other).at(g);
377 return *this;
378 }
379 Ballots operator|(add_cref<Ballots> other) const
380 {
381 Ballots res(*this);
382 res |= other;
383 return res;
384 }
385 add_ref<Ballots> operator<<=(uint32_t bits)
386 {
387 return ((*this) = ((*this) << bits));
388 }
389 Ballots operator<<(uint32_t bits) const
390 {
391 Ballots res(subgroupCount());
392 if (bits < size() && bits != 0u)
393 {
394 for (uint32_t b = 0; b < bits; ++b)
395 res.set((b + bits), test(b));
396 }
397 return res;
398 }
399 Ballots operator~() const
400 {
401 Ballots res(*this);
402 const uint32_t gg = subgroupCount();
403 for (uint32_t g = 0u; g < gg; ++g)
404 res.at(g) = super::at(g).operator~();
405 return res;
406 }
407 bool operator==(add_cref<Ballots> other) const
408 {
409 if (super::size() == upcast(other).size())
410 {
411 const uint32_t gg = subgroupCount();
412 for (uint32_t g = 0u; g < gg; ++g)
413 {
414 if (at(g) != other[g])
415 return false;
416 }
417 return true;
418 }
419 return false;
420 }
421 add_ref<Ballots> operator=(add_cref<Ballots> other)
422 {
423 DE_ASSERT((subgroupCount() == other.subgroupCount()));
424 const uint32_t gg = subgroupCount();
425 for (uint32_t g = 0u; g < gg; ++g)
426 at(g) = other.at(g);
427 return *this;
428 }
429 add_ref<Ballots> operator=(add_cref<value_type> forAllGroups)
430 {
431 DE_ASSERT(super::size() >= 1u);
432 const uint32_t gg = subgroupCount();
433 for (uint32_t g = 0u; g < gg; ++g)
434 at(g) = forAllGroups;
435 return *this;
436 }
437 };
438
439 uint64_t subgroupSizeToMask(uint32_t subgroupSize)
440 {
441 if (subgroupSize == 64)
442 return ~0ULL;
443 else
444 return (1ULL << subgroupSize) - 1;
445 }
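// e.g. subgroupSizeToMask(8) == 0xFF and subgroupSizeToMask(64) == ~0ULL; the special
// case avoids the undefined behaviour of shifting a 64-bit value by 64.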
446
447 Ballot subgroupSizeToMask(uint32_t subgroupSize, uint32_t subgroupCount)
448 {
449 DE_UNREF(subgroupCount);
450 Ballot b;
451 DE_ASSERT(subgroupSize <= b.size());
452 for (uint32_t i = 0; i < subgroupSize; ++i)
453 b.set(i);
454 return b;
455 }
456
457 // Take a 64-bit integer, mask it to the subgroup size, and then
458 // replicate it for each subgroup
459 bitset_inv_t bitsetFromU64(uint64_t mask, uint32_t subgroupSize)
460 {
461 mask &= subgroupSizeToMask(subgroupSize);
462 bitset_inv_t result(mask);
463 for (uint32_t i = 0; i < result.size() / subgroupSize - 1; ++i)
464 {
465 result = (result << subgroupSize) | bitset_inv_t(mask);
466 }
467 return result;
468 }
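// Example: bitsetFromU64(0x5, 4) keeps only the low 4 bits (0x5) and repeats that
// pattern in every 4-bit group of the MAX_INVOCATIONS_ALL_TESTS-wide bitset.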
469
470 Ballots ballotsFromU64(uint64_t maskValue, uint32_t subgroupSize, uint32_t subgroupCount)
471 {
472 Ballot b(maskValue);
473 b &= subgroupSizeToMask(subgroupSize, subgroupCount);
474 Ballots result(subgroupCount);
475 for (uint32_t g = 0; g < subgroupCount; ++g)
476 result.at(g) = b;
477 return result;
478 }
479
480 Ballots ballotsFromBallot(Ballot b, uint32_t subgroupSize, uint32_t subgroupCount)
481 {
482 b &= subgroupSizeToMask(subgroupSize, subgroupCount);
483 Ballots result(subgroupCount);
484 for (uint32_t g = 0; g < subgroupCount; ++g)
485 result.at(g) = b;
486 return result;
487 }
488
489 // Pick out the mask for the subgroup that invocationID is a member of
490 uint64_t bitsetToU64(const bitset_inv_t &bitset, uint32_t subgroupSize, uint32_t invocationID)
491 {
492 bitset_inv_t copy(bitset);
493 copy >>= (invocationID / subgroupSize) * subgroupSize;
494 copy &= bitset_inv_t(subgroupSizeToMask(subgroupSize));
495 uint64_t mask = copy.to_ullong();
496 mask &= subgroupSizeToMask(subgroupSize);
497 return mask;
498 }
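// Example: for subgroupSize == 8 and invocationID == 20, bits 16..23 of the bitset
// are returned as the low 8 bits of the result.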
499
500 // Pick out the mask for the subgroup that invocationID is a member of
501 Ballot bitsetToBallot(const Ballots &bitset, uint32_t subgroupSize, uint32_t invocationID)
502 {
503 return bitset.at(invocationID / subgroupSize) & subgroupSizeToMask(subgroupSize, bitset.subgroupCount());
504 }
505
506 // Pick out the mask for the subgroup that invocationID is a member of
507 Ballot bitsetToBallot(add_cref<Ballots> bitset, add_cref<Ballot> subgroupSizeMask, uint32_t subgroupSize,
508 uint32_t invocationID)
509 {
510 return bitset.at(invocationID / subgroupSize) & subgroupSizeMask;
511 }
512
513 Ballot bitsetToBallot(uint64_t value, uint32_t subgroupCount, uint32_t subgroupSize, uint32_t invocationID)
514 {
515 Ballots bs = ballotsFromU64(value, subgroupSize, subgroupCount);
516 return bitsetToBallot(bs, subgroupSize, invocationID);
517 }
518
519 static int findLSB(uint64_t value)
520 {
521 for (int i = 0; i < 64; i++)
522 {
523 if (value & (1ULL << i))
524 return i;
525 }
526 return -1;
527 }
528
529 template <uint32_t N>
530 static uint32_t findLSB(add_cref<std::bitset<N>> value)
531 {
532 for (uint32_t i = 0u; i < N; ++i)
533 {
534 if (value.test(i))
535 return i;
536 }
537 return std::numeric_limits<uint32_t>::max();
538 }
539
540 // For each subgroup, pick out the elected invocationID, and accumulate
541 // a bitset of all of them
542 static bitset_inv_t bitsetElect(const bitset_inv_t &value, int32_t subgroupSize)
543 {
544 bitset_inv_t ret; // zero initialized
545
546 for (int32_t i = 0; i < (int32_t)value.size(); i += subgroupSize)
547 {
548 uint64_t mask = bitsetToU64(value, subgroupSize, i);
549 int lsb = findLSB(mask);
550 ret |= bitset_inv_t(lsb == -1 ? 0 : (1ULL << lsb)) << i;
551 }
552 return ret;
553 }
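// Example: with subgroupSize == 4 and an active mask of 0b1100'0110, the elected
// bits are 0b0100'0010, i.e. the lowest set bit within each 4-wide subgroup.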
554
555 static Ballots bitsetElect(add_cref<Ballots> value)
556 {
557 Ballots ret(value.subgroupCount());
558 for (uint32_t g = 0u; g < value.subgroupCount(); ++g)
559 {
560 const uint32_t lsb = findLSB<Ballots::subgroupInvocationSize>(value.at(g));
561 if (lsb != std::numeric_limits<uint32_t>::max())
562 {
563 ret.at(g).set(lsb);
564 }
565 }
566 return ret;
567 }
568
569 struct PushConstant
570 {
571 int32_t invocationStride;
572 uint32_t width;
573 uint32_t height;
574 uint32_t primitiveStride;
575 uint32_t subgroupStride;
576 uint32_t enableInvocationIndex;
577 };
578
579 struct Vertex
580 {
581 // Traditional POD structure that mimics a vertex.
582 // Be careful before making any changes to this structure,
583 // because it is strictly mapped to VK_FORMAT_R32G32B32A32_SFLOAT
584 // when the graphics pipeline is constructed.
585 float x, y, z, w;
586 };
587
588 typedef Vertex Triangle[3];
589
590 class RandomProgram;
591 class ComputeRandomProgram;
592
593 std::pair<vk::VkPhysicalDeviceSubgroupProperties, vk::VkPhysicalDeviceProperties2> getSubgroupProperties(
594 vkt::Context &context)
595 {
596 vk::VkPhysicalDeviceSubgroupProperties subgroupProperties;
597 deMemset(&subgroupProperties, 0, sizeof(subgroupProperties));
598 subgroupProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
599
600 vk::VkPhysicalDeviceProperties2 properties2;
601 deMemset(&properties2, 0, sizeof(properties2));
602 properties2.sType = vk::VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
603 properties2.pNext = &subgroupProperties;
604
605 context.getInstanceInterface().getPhysicalDeviceProperties2(context.getPhysicalDevice(), &properties2);
606
607 return {subgroupProperties, properties2};
608 }
609
610 class ReconvergenceTestInstance : public TestInstance
611 {
612 public:
613 // { vert, frag, tesc, tese, geom }; if any
614 using Shaders = std::vector<Move<VkShaderModule>>;
615
616 ReconvergenceTestInstance(Context &context, const CaseDef &data)
617 : TestInstance(context)
618 , m_data(data)
619 , m_subgroupSize(getSubgroupProperties(context).first.subgroupSize)
620 {
621 }
622 ~ReconvergenceTestInstance(void) = default;
623
624 Move<VkPipeline> createComputePipeline(const VkPipelineLayout pipelineLayout, const VkShaderModule computeShader);
625 Move<VkPipeline> createGraphicsPipeline(const VkPipelineLayout pipelineLayout, const VkRenderPass renderPass,
626 const uint32_t width, const uint32_t height, const Shaders &shaders,
627 const VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
628 const uint32_t patchControlPoints = 0u);
629
630 protected:
631 const CaseDef m_data;
632 const uint32_t m_subgroupSize;
633 };
634
635 class ReconvergenceTestComputeInstance : public ReconvergenceTestInstance
636 {
637 public:
638 ReconvergenceTestComputeInstance(Context &context, const CaseDef &data, std::shared_ptr<RandomProgram> program,
639 std::map<uint32_t, uint32_t> &&subgroupSizeToMaxLoc)
640 : ReconvergenceTestInstance(context, data)
641 , m_program(std::static_pointer_cast<ComputeRandomProgram>(program))
642 , m_subgroupSizeToMaxLoc(std::move(subgroupSizeToMaxLoc))
643 {
644 }
645 ~ReconvergenceTestComputeInstance(void) = default;
646
647 virtual tcu::TestStatus iterate(void) override;
648 qpTestResult_e calculateAndLogResult(const tcu::UVec4 *result, const std::vector<tcu::UVec4> &ref,
649 uint32_t invocationStride, uint32_t subgroupSize, uint32_t shaderMaxLoc);
650
651 private:
652 std::shared_ptr<ComputeRandomProgram> m_program;
653 std::map<uint32_t, uint32_t> m_subgroupSizeToMaxLoc;
654 };
655
656 class ReconvergenceTestGraphicsInstance : public ReconvergenceTestInstance
657 {
658 public:
659 ReconvergenceTestGraphicsInstance(Context &context, const CaseDef &data) : ReconvergenceTestInstance(context, data)
660 {
661 }
662 ~ReconvergenceTestGraphicsInstance(void) = default;
663
664 auto makeRenderPassBeginInfo(const VkRenderPass renderPass, const VkFramebuffer framebuffer)
665 -> VkRenderPassBeginInfo;
666 virtual auto recordDrawingAndSubmit(const VkCommandBuffer cmdBuffer, const VkPipelineLayout pipelineLayout,
667 const VkPipeline pipeline, const VkDescriptorSet descriptorSet,
668 const PushConstant &pushConstant, const VkRenderPassBeginInfo &renderPassInfo,
669 const VkBuffer vertexBuffer, const uint32_t vertexCount, const VkImage image)
670 -> void;
671 virtual auto generateVertices(const uint32_t primitiveCount, const VkPrimitiveTopology topology,
672 const uint32_t patchSize = 1) -> std::vector<tcu::Vec4>;
673 virtual auto createVertexBufferAndFlush(const std::vector<tcu::Vec4> &vertices) -> de::MovePtr<BufferWithMemory>;
674 virtual auto createVertexBufferAndFlush(uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
675 -> de::MovePtr<BufferWithMemory>;
676 virtual auto createShaders(void) -> Shaders = 0;
677
678 enum PrintMode
679 {
680 None,
681 ThreadsInColumns,
682 OutLocsInColumns,
683 IntuitiveThreadsOutlocs,
684 Console
685 };
686
687 virtual auto calculateAndLogResult(const uint64_t *result, const std::vector<uint64_t> &ref,
688 uint32_t invocationStride, uint32_t subgroupSize, uint32_t shaderMaxLocs,
689 uint32_t primitiveCount, PrintMode printMode) -> qpTestResult_e;
690 };
691
692 class ReconvergenceTestFragmentInstance : public ReconvergenceTestGraphicsInstance
693 {
694 struct Arrangement
695 {
696 };
697 friend class FragmentRandomProgram;
698
699 public:
700 ReconvergenceTestFragmentInstance(Context &context, const CaseDef &data)
701 : ReconvergenceTestGraphicsInstance(context, data)
702 {
703 }
704 ~ReconvergenceTestFragmentInstance(void) = default;
705 virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
706 auto callAuxiliaryShader(tcu::TestStatus &status, uint32_t triangleCount) -> std::vector<uint32_t>;
707 auto makeImageCreateInfo(VkFormat format) const -> VkImageCreateInfo;
708 virtual auto createVertexBufferAndFlush(uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
709 -> de::MovePtr<BufferWithMemory> override;
710 virtual auto iterate(void) -> tcu::TestStatus override;
711 auto calculateAndLogResultEx(tcu::TestLog &log, const tcu::UVec4 *result, const std::vector<tcu::UVec4> &ref,
712 const uint32_t maxLoc, const Arrangement &a, const PrintMode printMode)
713 -> qpTestResult_e;
714 };
715
716 class ReconvergenceTestVertexInstance : public ReconvergenceTestGraphicsInstance
717 {
718 public:
719 ReconvergenceTestVertexInstance(Context &context, const CaseDef &data)
720 : ReconvergenceTestGraphicsInstance(context, data)
721 {
722 }
723 ~ReconvergenceTestVertexInstance(void) = default;
724 virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
725 virtual auto createVertexBufferAndFlush(uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
726 -> de::MovePtr<BufferWithMemory> override;
727
728 virtual auto iterate(void) -> tcu::TestStatus override;
729 auto calculateAndLogResultEx(add_ref<tcu::TestLog> log, const tcu::UVec4 *result,
730 const std::vector<tcu::UVec4> &ref, const uint32_t maxLoc, const PrintMode printMode)
731 -> qpTestResult_e;
732 };
733
734 class ReconvergenceTestTessCtrlInstance : public ReconvergenceTestGraphicsInstance
735 {
736 public:
737 ReconvergenceTestTessCtrlInstance(Context &context, const CaseDef &data)
738 : ReconvergenceTestGraphicsInstance(context, data)
739 {
740 }
741 ~ReconvergenceTestTessCtrlInstance(void) = default;
742 virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
743 virtual auto iterate(void) -> tcu::TestStatus override;
744 };
745
746 class ReconvergenceTestTessEvalInstance : public ReconvergenceTestGraphicsInstance
747 {
748 public:
749 ReconvergenceTestTessEvalInstance(Context &context, add_cref<CaseDef> data)
750 : ReconvergenceTestGraphicsInstance(context, data)
751 {
752 }
753 ~ReconvergenceTestTessEvalInstance(void) = default;
754 virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
755 virtual auto iterate(void) -> tcu::TestStatus override;
756 };
757
758 class ReconvergenceTestGeometryInstance : public ReconvergenceTestGraphicsInstance
759 {
760 public:
761 ReconvergenceTestGeometryInstance(Context &context, add_cref<CaseDef> data)
762 : ReconvergenceTestGraphicsInstance(context, data)
763 {
764 }
765 ~ReconvergenceTestGeometryInstance(void) = default;
766 virtual auto createShaders(void) -> std::vector<Move<VkShaderModule>> override;
767 virtual auto createVertexBufferAndFlush(uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
768 -> de::MovePtr<BufferWithMemory> override;
769
770 virtual auto iterate(void) -> tcu::TestStatus override;
771 auto calculateAndLogResultEx(add_ref<tcu::TestLog> log, const tcu::UVec4 *result,
772 const std::vector<tcu::UVec4> &ref, const uint32_t maxLoc, const PrintMode printMode)
773 -> qpTestResult_e;
774 };
775
776 Move<VkPipeline> ReconvergenceTestInstance::createGraphicsPipeline(const VkPipelineLayout pipelineLayout,
777 const VkRenderPass renderPass, const uint32_t width,
778 const uint32_t height, const Shaders &shaders,
779 const VkPrimitiveTopology topology,
780 const uint32_t patchControlPoints)
781 {
782 const DeviceInterface &vkd = m_context.getDeviceInterface();
783 const VkDevice device = m_context.getDevice();
784 const uint32_t subpass = 0;
785
786 const std::vector<VkViewport> viewports{makeViewport(width, height)};
787 const std::vector<VkRect2D> scissors{makeRect2D(width, height)};
788
789 enum ShaderIndex
790 {
791 IVERT = 0,
792 IFRAG,
793 ITESC,
794 ITESE,
795 IGEOM
796 };
797 VkShaderModule handles[5] = {DE_NULL}; // { vert, frag, tesc, tese, geom }
798
799 for (uint32_t i = 0; i < (uint32_t)ARRAYSIZE(handles); ++i)
800 {
801 handles[i] = (i < (uint32_t)shaders.size()) ? *shaders[i] : DE_NULL;
802 }
803
804 return makeGraphicsPipeline(vkd, device, pipelineLayout, handles[IVERT], handles[ITESC], handles[ITESE],
805 handles[IGEOM], handles[IFRAG], renderPass, viewports, scissors, topology, subpass,
806 patchControlPoints);
807 }
808
809 Move<VkPipeline> ReconvergenceTestInstance::createComputePipeline(const VkPipelineLayout pipelineLayout,
810 const VkShaderModule computeShader)
811 {
812 const DeviceInterface &vk = m_context.getDeviceInterface();
813 const VkDevice device = m_context.getDevice();
814
815 const uint32_t specData[2] = {m_data.sizeX, m_data.sizeY};
816 const vk::VkSpecializationMapEntry entries[DE_LENGTH_OF_ARRAY(specData)] = {
817 {0, (uint32_t)(sizeof(uint32_t) * 0), sizeof(uint32_t)},
818 {1, (uint32_t)(sizeof(uint32_t) * 1), sizeof(uint32_t)},
819 };
820 const vk::VkSpecializationInfo specInfo = {
821 DE_LENGTH_OF_ARRAY(entries), // mapEntryCount
822 entries, // pMapEntries
823 sizeof(specData), // dataSize
824 specData // pData
825 };
826
827 const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroupSizeCreateInfo = {
828 VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, // VkStructureType sType;
829 DE_NULL, // void* pNext;
830 m_subgroupSize // uint32_t requiredSubgroupSize;
831 };
832
833 const VkBool32 computeFullSubgroups =
834 m_subgroupSize <= 64 && m_context.getSubgroupSizeControlFeatures().computeFullSubgroups;
835
836 const void *shaderPNext = computeFullSubgroups ? &subgroupSizeCreateInfo : DE_NULL;
837 VkPipelineShaderStageCreateFlags pipelineShaderStageCreateFlags =
838 (VkPipelineShaderStageCreateFlags)(computeFullSubgroups ?
839 VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT :
840 0);
841
842 const VkPipelineShaderStageCreateInfo shaderCreateInfo = {
843 VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
844 shaderPNext,
845 pipelineShaderStageCreateFlags,
846 VK_SHADER_STAGE_COMPUTE_BIT, // stage
847 computeShader, // shader
848 "main",
849 &specInfo, // pSpecializationInfo
850 };
851
852 const VkComputePipelineCreateInfo pipelineCreateInfo = {
853 VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
854 DE_NULL,
855 0u, // flags
856 shaderCreateInfo, // cs
857 pipelineLayout, // layout
858 (vk::VkPipeline)0, // basePipelineHandle
859 0u, // basePipelineIndex
860 };
861
862 return vk::createComputePipeline(vk, device, DE_NULL, &pipelineCreateInfo, NULL);
863 }
864
865 typedef enum
866 {
867 // store subgroupBallot().
868 // For OP_BALLOT, OP::caseValue is initialized to zero, and then
869 // set to 1 by simulate if the ballot is not workgroup- (or subgroup-) uniform.
870 // Only workgroup-uniform ballots are validated for correctness in
871 // WUCF modes.
872 OP_BALLOT,
873
874 // store literal constant
875 OP_STORE,
876
877 // if ((1ULL << gl_SubgroupInvocationID) & mask).
878 // Special case if mask = ~0ULL, converted into "if (inputA.a[idx] == idx)"
879 OP_IF_MASK,
880 OP_ELSE_MASK,
881 OP_ENDIF,
882
883 // if (gl_SubgroupInvocationID == loopIdxN) (where N is most nested loop counter)
884 OP_IF_LOOPCOUNT,
885 OP_ELSE_LOOPCOUNT,
886
887 // if (gl_LocalInvocationIndex >= inputA.a[N]) (where N is most nested loop counter)
888 OP_IF_LOCAL_INVOCATION_INDEX,
889 OP_ELSE_LOCAL_INVOCATION_INDEX,
890
891 // break/continue
892 OP_BREAK,
893 OP_CONTINUE,
894
895 // if (subgroupElect())
896 OP_ELECT,
897
898 // Loop with uniform number of iterations (read from a buffer)
899 OP_BEGIN_FOR_UNIF,
900 OP_END_FOR_UNIF,
901
902 // for (int loopIdxN = 0; loopIdxN < gl_SubgroupInvocationID + 1; ++loopIdxN)
903 OP_BEGIN_FOR_VAR,
904 OP_END_FOR_VAR,
905
906 // for (int loopIdxN = 0;; ++loopIdxN, OP_BALLOT)
907 // Always has an "if (subgroupElect()) break;" inside.
908 // Does the equivalent of OP_BALLOT in the continue construct
909 OP_BEGIN_FOR_INF,
910 OP_END_FOR_INF,
911
912 // do { loopIdxN++; ... } while (loopIdxN < uniformValue);
913 OP_BEGIN_DO_WHILE_UNIF,
914 OP_END_DO_WHILE_UNIF,
915
916 // do { ... } while (true);
917 // Always has an "if (subgroupElect()) break;" inside
918 OP_BEGIN_DO_WHILE_INF,
919 OP_END_DO_WHILE_INF,
920
921 // return;
922 OP_RETURN,
923
924 // function call (code bracketed by these is extracted into a separate function)
925 OP_CALL_BEGIN,
926 OP_CALL_END,
927
928 // switch statement on uniform value
929 OP_SWITCH_UNIF_BEGIN,
930 // switch statement on gl_SubgroupInvocationID & 3 value
931 OP_SWITCH_VAR_BEGIN,
932 // switch statement on loopIdx value
933 OP_SWITCH_LOOP_COUNT_BEGIN,
934
935 // case statement with a (invocation mask, case mask) pair
936 OP_CASE_MASK_BEGIN,
937 // case statement used for loop counter switches, with a value and a mask of loop iterations
938 OP_CASE_LOOP_COUNT_BEGIN,
939
940 // end of switch/case statement
941 OP_SWITCH_END,
942 OP_CASE_END,
943
944 // Extra code with no functional effect. Currently includes:
945 // - value 0: while (!subgroupElect()) {}
946 // - value 1: if (condition_that_is_false) { infinite loop }
947 OP_NOISE,
948
949 // do nothing, only markup
950 OP_NOP
951 } OPType;
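// For illustration (see printCode below), a short op sequence such as
// { OP_IF_MASK, OP_BALLOT, OP_ENDIF } is emitted roughly as:
//   if (testBit(uvec4(0x.., 0x.., 0x.., 0x..), gl_SubgroupInvocationID)) {
//       outputC.loc[gl_LocalInvocationIndex]++,
//       outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = subgroupBallot(true).xy;
//   }
// i.e. each OP_BALLOT records the current active mask at a fresh output location.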
952
953 const char *OPtypeToStr(const OPType op)
954 {
955 #define MAKETEXT(s__) #s__
956 #define CASETEXT(e__) \
957 case e__: \
958 return MAKETEXT(e__)
959 switch (op)
960 {
961 CASETEXT(OP_BALLOT);
962 CASETEXT(OP_STORE);
963 CASETEXT(OP_IF_MASK);
964 CASETEXT(OP_ELSE_MASK);
965 CASETEXT(OP_ENDIF);
966 CASETEXT(OP_IF_LOOPCOUNT);
967 CASETEXT(OP_ELSE_LOOPCOUNT);
968 CASETEXT(OP_IF_LOCAL_INVOCATION_INDEX);
969 CASETEXT(OP_ELSE_LOCAL_INVOCATION_INDEX);
970 CASETEXT(OP_BREAK);
971 CASETEXT(OP_CONTINUE);
972 CASETEXT(OP_ELECT);
973 CASETEXT(OP_BEGIN_FOR_UNIF);
974 CASETEXT(OP_END_FOR_UNIF);
975 CASETEXT(OP_BEGIN_FOR_VAR);
976 CASETEXT(OP_END_FOR_VAR);
977 CASETEXT(OP_BEGIN_FOR_INF);
978 CASETEXT(OP_END_FOR_INF);
979 CASETEXT(OP_BEGIN_DO_WHILE_UNIF);
980 CASETEXT(OP_END_DO_WHILE_UNIF);
981 CASETEXT(OP_BEGIN_DO_WHILE_INF);
982 CASETEXT(OP_END_DO_WHILE_INF);
983 CASETEXT(OP_RETURN);
984 CASETEXT(OP_CALL_BEGIN);
985 CASETEXT(OP_CALL_END);
986 CASETEXT(OP_SWITCH_UNIF_BEGIN);
987 CASETEXT(OP_SWITCH_VAR_BEGIN);
988 CASETEXT(OP_SWITCH_LOOP_COUNT_BEGIN);
989 CASETEXT(OP_CASE_MASK_BEGIN);
990 CASETEXT(OP_CASE_LOOP_COUNT_BEGIN);
991 CASETEXT(OP_SWITCH_END);
992 CASETEXT(OP_CASE_END);
993 CASETEXT(OP_NOISE);
994 CASETEXT(OP_NOP);
995 }
996 return "<Unknown>";
997 }
998
999 typedef enum
1000 {
1001 // Different if test conditions
1002 IF_MASK,
1003 IF_UNIFORM,
1004 IF_LOOPCOUNT,
1005 IF_LOCAL_INVOCATION_INDEX,
1006 } IFType;
1007
1008 class OP
1009 {
1010 public:
1011 OP(OPType _type, uint64_t _value, uint32_t _caseValue = 0)
1012 : type(_type)
1013 , value(_value)
1014 // by default, the 128-bit ballot is initialized by repeating the 64-bit _value in both halves
1015 , bvalue(tcu::UVec4(uint32_t(_value), uint32_t(_value >> 32), uint32_t(_value), uint32_t(_value >> 32)))
1016 , caseValue(_caseValue)
1017 {
1018 }
1019
1020 // The type of operation and an optional value.
1021 // The value could be a mask for an if test, the index of the loop
1022 // header for an end of loop, or the constant value for a store instruction
1023 OPType type;
1024 uint64_t value;
1025 Ballot bvalue;
1026 uint32_t caseValue;
1027 };
1028
1029 class RandomProgram
1030 {
1031
1032 public:
1033 RandomProgram(const CaseDef &c, uint32_t invocationCount = 0u)
1034 : caseDef(c)
1035 , invocationStride(invocationCount ? invocationCount : (c.sizeX * c.sizeY))
1036 , rnd()
1037 , ops()
1038 , masks()
1039 , ballotMasks()
1040 , numMasks(5)
1041 , nesting(0)
1042 , maxNesting(c.maxNesting)
1043 , loopNesting(0)
1044 , loopNestingThisFunction(0)
1045 , callNesting(0)
1046 , minCount(30)
1047 , indent(0)
1048 , isLoopInf(100, false)
1049 , doneInfLoopBreak(100, false)
1050 , storeBase(0x10000)
1051 {
1052 deRandom_init(&rnd, caseDef.seed);
1053 for (int i = 0; i < numMasks; ++i)
1054 {
1055 const uint64_t lo = deRandom_getUint64(&rnd);
1056 const uint64_t hi = deRandom_getUint64(&rnd);
1057 const tcu::UVec4 v4(uint32_t(lo), uint32_t(lo >> 32), uint32_t(hi), uint32_t(hi >> 32));
1058 ballotMasks.emplace_back(v4);
1059 masks.push_back(lo);
1060 }
1061 }
1062 virtual ~RandomProgram() = default;
1063
1064 const CaseDef caseDef;
1065 const uint32_t invocationStride;
1066 deRandom rnd;
1067 vector<OP> ops;
1068 vector<uint64_t> masks;
1069 vector<Ballot> ballotMasks;
1070 int32_t numMasks;
1071 int32_t nesting;
1072 int32_t maxNesting;
1073 int32_t loopNesting;
1074 int32_t loopNestingThisFunction;
1075 int32_t callNesting;
1076 int32_t minCount;
1077 int32_t indent;
1078 vector<bool> isLoopInf;
1079 vector<bool> doneInfLoopBreak;
1080 // Offset the value we use for OP_STORE, to avoid colliding with fully converged
1081 // active masks with small subgroup sizes (e.g. with subgroupSize == 4, the SUCF
1082 // tests need to know that 0xF is really an active mask).
1083 int32_t storeBase;
1084
1085 virtual void genIf(IFType ifType, uint32_t maxLocalIndexCmp = 0u)
1086 {
1087 uint32_t maskIdx = deRandom_getUint32(&rnd) % numMasks;
1088 uint64_t mask = masks[maskIdx];
1089 Ballot bmask = ballotMasks[maskIdx];
1090 if (ifType == IF_UNIFORM)
1091 {
1092 mask = ~0ULL;
1093 bmask.set();
1094 }
1095
1096 uint32_t localIndexCmp = deRandom_getUint32(&rnd) % (maxLocalIndexCmp ? maxLocalIndexCmp : invocationStride);
1097 if (ifType == IF_LOCAL_INVOCATION_INDEX)
1098 ops.push_back({OP_IF_LOCAL_INVOCATION_INDEX, localIndexCmp});
1099 else if (ifType == IF_LOOPCOUNT)
1100 ops.push_back({OP_IF_LOOPCOUNT, 0});
1101 else
1102 {
1103 ops.push_back({OP_IF_MASK, mask});
1104 ops.back().bvalue = bmask;
1105 }
1106
1107 nesting++;
1108
1109 size_t thenBegin = ops.size();
1110 pickOP(2);
1111 size_t thenEnd = ops.size();
1112
1113 uint32_t randElse = (deRandom_getUint32(&rnd) % 100);
1114 if (randElse < 50)
1115 {
1116 if (ifType == IF_LOCAL_INVOCATION_INDEX)
1117 ops.push_back({OP_ELSE_LOCAL_INVOCATION_INDEX, localIndexCmp});
1118 else if (ifType == IF_LOOPCOUNT)
1119 ops.push_back({OP_ELSE_LOOPCOUNT, 0});
1120 else
1121 ops.push_back({OP_ELSE_MASK, 0});
1122
1123 if (randElse < 10)
1124 {
1125 // Sometimes make the else block identical to the then block
1126 for (size_t i = thenBegin; i < thenEnd; ++i)
1127 ops.push_back(ops[i]);
1128 }
1129 else
1130 pickOP(2);
1131 }
1132 ops.push_back({OP_ENDIF, 0});
1133 nesting--;
1134 }
1135
1136 void genForUnif()
1137 {
1138 uint32_t iterCount = (deRandom_getUint32(&rnd) % 5) + 1;
1139 ops.push_back({OP_BEGIN_FOR_UNIF, iterCount});
1140 uint32_t loopheader = (uint32_t)ops.size() - 1;
1141 nesting++;
1142 loopNesting++;
1143 loopNestingThisFunction++;
1144 pickOP(2);
1145 ops.push_back({OP_END_FOR_UNIF, loopheader});
1146 loopNestingThisFunction--;
1147 loopNesting--;
1148 nesting--;
1149 }
1150
1151 void genDoWhileUnif()
1152 {
1153 uint32_t iterCount = (deRandom_getUint32(&rnd) % 5) + 1;
1154 ops.push_back({OP_BEGIN_DO_WHILE_UNIF, iterCount});
1155 uint32_t loopheader = (uint32_t)ops.size() - 1;
1156 nesting++;
1157 loopNesting++;
1158 loopNestingThisFunction++;
1159 pickOP(2);
1160 ops.push_back({OP_END_DO_WHILE_UNIF, loopheader});
1161 loopNestingThisFunction--;
1162 loopNesting--;
1163 nesting--;
1164 }
1165
1166 void genForVar()
1167 {
1168 ops.push_back({OP_BEGIN_FOR_VAR, 0});
1169 uint32_t loopheader = (uint32_t)ops.size() - 1;
1170 nesting++;
1171 loopNesting++;
1172 loopNestingThisFunction++;
1173 pickOP(2);
1174 ops.push_back({OP_END_FOR_VAR, loopheader});
1175 loopNestingThisFunction--;
1176 loopNesting--;
1177 nesting--;
1178 }
1179
1180 void genForInf()
1181 {
1182 ops.push_back({OP_BEGIN_FOR_INF, 0});
1183 uint32_t loopheader = (uint32_t)ops.size() - 1;
1184
1185 nesting++;
1186 loopNesting++;
1187 loopNestingThisFunction++;
1188 isLoopInf[loopNesting] = true;
1189 doneInfLoopBreak[loopNesting] = false;
1190
1191 pickOP(2);
1192
1193 genElect(true);
1194 doneInfLoopBreak[loopNesting] = true;
1195
1196 pickOP(2);
1197
1198 ops.push_back({OP_END_FOR_INF, loopheader});
1199
1200 isLoopInf[loopNesting] = false;
1201 doneInfLoopBreak[loopNesting] = false;
1202 loopNestingThisFunction--;
1203 loopNesting--;
1204 nesting--;
1205 }
1206
1207 void genDoWhileInf()
1208 {
1209 ops.push_back({OP_BEGIN_DO_WHILE_INF, 0});
1210 uint32_t loopheader = (uint32_t)ops.size() - 1;
1211
1212 nesting++;
1213 loopNesting++;
1214 loopNestingThisFunction++;
1215 isLoopInf[loopNesting] = true;
1216 doneInfLoopBreak[loopNesting] = false;
1217
1218 pickOP(2);
1219
1220 genElect(true);
1221 doneInfLoopBreak[loopNesting] = true;
1222
1223 pickOP(2);
1224
1225 ops.push_back({OP_END_DO_WHILE_INF, loopheader});
1226
1227 isLoopInf[loopNesting] = false;
1228 doneInfLoopBreak[loopNesting] = false;
1229 loopNestingThisFunction--;
1230 loopNesting--;
1231 nesting--;
1232 }
1233
1234 void genBreak()
1235 {
1236 if (loopNestingThisFunction > 0)
1237 {
1238 // Sometimes put the break in a divergent if
1239 if ((deRandom_getUint32(&rnd) % 100) < 10)
1240 {
1241 ops.push_back({OP_IF_MASK, masks[0]});
1242 ops.back().bvalue = ballotMasks[0];
1243 ops.push_back({OP_BREAK, 0});
1244 ops.push_back({OP_ELSE_MASK, 0});
1245 ops.push_back({OP_BREAK, 0});
1246 ops.push_back({OP_ENDIF, 0});
1247 }
1248 else
1249 ops.push_back({OP_BREAK, 0});
1250 }
1251 }
1252
1253 void genContinue()
1254 {
1255 // continues are allowed if we're in a loop and the loop is not infinite,
1256 // or if it is infinite and we've already done a subgroupElect+break.
1257 // However, adding more continues seems to reduce the failure rate, so
1258 // disable it for now
1259 if (loopNestingThisFunction > 0 && !(isLoopInf[loopNesting] /*&& !doneInfLoopBreak[loopNesting]*/))
1260 {
1261 // Sometimes put the continue in a divergent if
1262 if ((deRandom_getUint32(&rnd) % 100) < 10)
1263 {
1264 ops.push_back({OP_IF_MASK, masks[0]});
1265 ops.back().bvalue = ballotMasks[0];
1266 ops.push_back({OP_CONTINUE, 0});
1267 ops.push_back({OP_ELSE_MASK, 0});
1268 ops.push_back({OP_CONTINUE, 0});
1269 ops.push_back({OP_ENDIF, 0});
1270 }
1271 else
1272 ops.push_back({OP_CONTINUE, 0});
1273 }
1274 }
1275
1276 // doBreak is used to generate "if (subgroupElect()) { ... break; }" inside infinite loops
1277 void genElect(bool doBreak)
1278 {
1279 ops.push_back({OP_ELECT, 0});
1280 nesting++;
1281 if (doBreak)
1282 {
1283 // Put something interesting before the break
1284 genBallot();
1285 genBallot();
1286 if ((deRandom_getUint32(&rnd) % 100) < 10)
1287 pickOP(1);
1288
1289 // if we're in a function, sometimes use return instead
1290 if (callNesting > 0 && (deRandom_getUint32(&rnd) % 100) < 30)
1291 ops.push_back({OP_RETURN, 0});
1292 else
1293 genBreak();
1294 }
1295 else
1296 pickOP(2);
1297
1298 ops.push_back({OP_ENDIF, 0});
1299 nesting--;
1300 }
1301
1302 void genReturn()
1303 {
1304 uint32_t r = deRandom_getUint32(&rnd) % 100;
1305 if (nesting > 0 &&
1306 // Use return rarely in main, 20% of the time in a singly nested loop in a function
1307 // and 50% of the time in a multiply nested loop in a function
1308 (r < 5 || (callNesting > 0 && loopNestingThisFunction > 0 && r < 20) ||
1309 (callNesting > 0 && loopNestingThisFunction > 1 && r < 50)))
1310 {
1311 genBallot();
1312 if ((deRandom_getUint32(&rnd) % 100) < 10)
1313 {
1314 ops.push_back({OP_IF_MASK, masks[0]});
1315 ops.back().bvalue = ballotMasks[0];
1316 ops.push_back({OP_RETURN, 0});
1317 ops.push_back({OP_ELSE_MASK, 0});
1318 ops.push_back({OP_RETURN, 0});
1319 ops.push_back({OP_ENDIF, 0});
1320 }
1321 else
1322 ops.push_back({OP_RETURN, 0});
1323 }
1324 }
1325
1326 // Generate a function call. Save and restore some loop information, which is used to
1327 // determine when it's safe to use break/continue
1328 void genCall()
1329 {
1330 ops.push_back({OP_CALL_BEGIN, 0});
1331 callNesting++;
1332 nesting++;
1333 int32_t saveLoopNestingThisFunction = loopNestingThisFunction;
1334 loopNestingThisFunction = 0;
1335
1336 pickOP(2);
1337
1338 loopNestingThisFunction = saveLoopNestingThisFunction;
1339 nesting--;
1340 callNesting--;
1341 ops.push_back({OP_CALL_END, 0});
1342 }
1343
1344 // Generate switch on a uniform value:
1345 // switch (inputA.a[r]) {
1346 // case r+1: ... break; // should not execute
1347 // case r: ... break; // should branch uniformly
1348 // case r+2: ... break; // should not execute
1349 // }
1350 void genSwitchUnif()
1351 {
1352 uint32_t r = deRandom_getUint32(&rnd) % 5;
1353 ops.push_back({OP_SWITCH_UNIF_BEGIN, r});
1354 nesting++;
1355
1356 ops.push_back({OP_CASE_MASK_BEGIN, 0, 1u << (r + 1)});
1357 pickOP(1);
1358 ops.push_back({OP_CASE_END, 0});
1359
1360 ops.push_back({OP_CASE_MASK_BEGIN, ~0ULL, 1u << r});
1361 ops.back().bvalue.set();
1362 pickOP(2);
1363 ops.push_back({OP_CASE_END, 0});
1364
1365 ops.push_back({OP_CASE_MASK_BEGIN, 0, 1u << (r + 2)});
1366 pickOP(1);
1367 ops.push_back({OP_CASE_END, 0});
1368
1369 ops.push_back({OP_SWITCH_END, 0});
1370 nesting--;
1371 }
1372
1373 // switch (gl_SubgroupInvocationID & 3) with four unique targets
1374 void genSwitchVar()
1375 {
1376 ops.push_back({OP_SWITCH_VAR_BEGIN, 0});
1377 nesting++;
1378
1379 ops.push_back({OP_CASE_MASK_BEGIN, 0x1111111111111111ULL, 1 << 0});
1380 ops.back().bvalue = tcu::UVec4(0x11111111);
1381 pickOP(1);
1382 ops.push_back({OP_CASE_END, 0});
1383
1384 ops.push_back({OP_CASE_MASK_BEGIN, 0x2222222222222222ULL, 1 << 1});
1385 ops.back().bvalue = tcu::UVec4(0x22222222);
1386 pickOP(1);
1387 ops.push_back({OP_CASE_END, 0});
1388
1389 ops.push_back({OP_CASE_MASK_BEGIN, 0x4444444444444444ULL, 1 << 2});
1390 ops.back().bvalue = tcu::UVec4(0x44444444);
1391 pickOP(1);
1392 ops.push_back({OP_CASE_END, 0});
1393
1394 ops.push_back({OP_CASE_MASK_BEGIN, 0x8888888888888888ULL, 1 << 3});
1395 ops.back().bvalue = tcu::UVec4(0x88888888);
1396 pickOP(1);
1397 ops.push_back({OP_CASE_END, 0});
1398
1399 ops.push_back({OP_SWITCH_END, 0});
1400 nesting--;
1401 }
1402
1403 // switch (gl_SubgroupInvocationID & 3) with two shared targets.
1404 // XXX TODO: The test considers these two targets to remain converged,
1405 // though we haven't agreed to that behavior yet.
1406 void genSwitchMulticase()
1407 {
1408 ops.push_back({OP_SWITCH_VAR_BEGIN, 0});
1409 nesting++;
1410
1411 ops.push_back({OP_CASE_MASK_BEGIN, 0x3333333333333333ULL, (1 << 0) | (1 << 1)});
1412 ops.back().bvalue = tcu::UVec4(0x33333333);
1413 pickOP(2);
1414 ops.push_back({OP_CASE_END, 0});
1415
1416 ops.push_back({OP_CASE_MASK_BEGIN, 0xCCCCCCCCCCCCCCCCULL, (1 << 2) | (1 << 3)});
1417 ops.back().bvalue = tcu::UVec4(0xCCCCCCCC);
1418 pickOP(2);
1419 ops.push_back({OP_CASE_END, 0});
1420
1421 ops.push_back({OP_SWITCH_END, 0});
1422 nesting--;
1423 }
1424
1425 // switch (loopIdxN) {
1426 // case 1: ... break;
1427 // case 2: ... break;
1428 // default: ... break;
1429 // }
1430 void genSwitchLoopCount()
1431 {
1432 uint32_t r = deRandom_getUint32(&rnd) % loopNesting;
1433 ops.push_back({OP_SWITCH_LOOP_COUNT_BEGIN, r});
1434 nesting++;
1435
1436 ops.push_back({OP_CASE_LOOP_COUNT_BEGIN, 1ULL << 1, 1});
1437 ops.back().bvalue = tcu::UVec4(1 << 1, 0, 0, 0);
1438 pickOP(1);
1439 ops.push_back({OP_CASE_END, 0});
1440
1441 ops.push_back({OP_CASE_LOOP_COUNT_BEGIN, 1ULL << 2, 2});
1442 ops.back().bvalue = tcu::UVec4(1 << 2, 0, 0, 0);
1443 pickOP(1);
1444 ops.push_back({OP_CASE_END, 0});
1445
1446 // default:
1447 ops.push_back({OP_CASE_LOOP_COUNT_BEGIN, ~6ULL, 0xFFFFFFFF});
1448 ops.back().bvalue = tcu::UVec4(~6u, ~0u, ~0u, ~0u);
1449 pickOP(1);
1450 ops.push_back({OP_CASE_END, 0});
1451
1452 ops.push_back({OP_SWITCH_END, 0});
1453 nesting--;
1454 }
1455
1456 void pickOP(uint32_t count)
1457 {
1458 // Pick "count" instructions. These can recursively insert more instructions,
1459 // so "count" is just a seed
1460 for (uint32_t i = 0; i < count; ++i)
1461 {
1462 genBallot();
1463 if (nesting < maxNesting)
1464 {
1465 uint32_t r = deRandom_getUint32(&rnd) % 11;
1466 switch (r)
1467 {
1468 default:
1469 DE_ASSERT(0);
1470 // fallthrough
1471 case 2:
1472 if (loopNesting)
1473 {
1474 genIf(IF_LOOPCOUNT);
1475 break;
1476 }
1477 // fallthrough
1478 case 10:
1479 genIf(IF_LOCAL_INVOCATION_INDEX);
1480 break;
1481 case 0:
1482 genIf(IF_MASK);
1483 break;
1484 case 1:
1485 genIf(IF_UNIFORM);
1486 break;
1487 case 3:
1488 {
1489 // don't nest loops too deeply, to avoid extreme memory usage or timeouts
1490 if (loopNesting <= 3)
1491 {
1492 uint32_t r2 = deRandom_getUint32(&rnd) % 3;
1493 switch (r2)
1494 {
1495 default:
1496 DE_ASSERT(0); // fallthrough
1497 case 0:
1498 genForUnif();
1499 break;
1500 case 1:
1501 genForInf();
1502 break;
1503 case 2:
1504 genForVar();
1505 break;
1506 }
1507 }
1508 }
1509 break;
1510 case 4:
1511 genBreak();
1512 break;
1513 case 5:
1514 genContinue();
1515 break;
1516 case 6:
1517 genElect(false);
1518 break;
1519 case 7:
1520 {
1521 uint32_t r2 = deRandom_getUint32(&rnd) % 5;
1522 if (r2 == 0 && callNesting == 0 && nesting < maxNesting - 2)
1523 genCall();
1524 else
1525 genReturn();
1526 break;
1527 }
1528 case 8:
1529 {
1530 // don't nest loops too deeply, to avoid extreme memory usage or timeouts
1531 if (loopNesting <= 3)
1532 {
1533 uint32_t r2 = deRandom_getUint32(&rnd) % 2;
1534 switch (r2)
1535 {
1536 default:
1537 DE_ASSERT(0); // fallthrough
1538 case 0:
1539 genDoWhileUnif();
1540 break;
1541 case 1:
1542 genDoWhileInf();
1543 break;
1544 }
1545 }
1546 }
1547 break;
1548 case 9:
1549 {
1550 uint32_t r2 = deRandom_getUint32(&rnd) % 4;
1551 switch (r2)
1552 {
1553 default:
1554 DE_ASSERT(0);
1555 // fallthrough
1556 case 0:
1557 genSwitchUnif();
1558 break;
1559 case 1:
1560 if (loopNesting > 0)
1561 {
1562 genSwitchLoopCount();
1563 break;
1564 }
1565 // fallthrough
1566 case 2:
1567 if (caseDef.testType != TT_MAXIMAL)
1568 {
1569 // multicase doesn't have fully-defined behavior for MAXIMAL tests,
1570 // but does for SUCF tests
1571 genSwitchMulticase();
1572 break;
1573 }
1574 // fallthrough
1575 case 3:
1576 genSwitchVar();
1577 break;
1578 }
1579 }
1580 break;
1581 }
1582 }
1583 genBallot();
1584 }
1585 }
1586
1587 void genBallot()
1588 {
1589 // optionally insert ballots, stores, and noise. Ballots and stores are used to determine
1590 // correctness.
1591 if ((deRandom_getUint32(&rnd) % 100) < 20)
1592 {
1593 if (ops.size() < 2 || !(ops[ops.size() - 1].type == OP_BALLOT ||
1594 (ops[ops.size() - 1].type == OP_STORE && ops[ops.size() - 2].type == OP_BALLOT)))
1595 {
1596 // do a store along with each ballot, so we can correlate where
1597 // the ballot came from
1598 if (caseDef.testType != TT_MAXIMAL)
1599 ops.push_back({OP_STORE, (uint32_t)ops.size() + storeBase});
1600 ops.push_back({OP_BALLOT, 0});
1601 }
1602 }
1603
1604 if ((deRandom_getUint32(&rnd) % 100) < 10)
1605 {
1606 if (ops.size() < 2 || !(ops[ops.size() - 1].type == OP_STORE ||
1607 (ops[ops.size() - 1].type == OP_BALLOT && ops[ops.size() - 2].type == OP_STORE)))
1608 {
1609 // SUCF does a store with every ballot. Don't bloat the code by adding more.
1610 if (caseDef.testType == TT_MAXIMAL)
1611 ops.push_back({OP_STORE, (uint32_t)ops.size() + storeBase});
1612 }
1613 }
1614
1615 uint32_t r = deRandom_getUint32(&rnd) % 10000;
1616 if (r < 3)
1617 ops.push_back({OP_NOISE, 0});
1618 else if (r < 10)
1619 ops.push_back({OP_NOISE, 1});
1620 }
1621
1622 std::map<uint32_t, uint32_t> generateRandomProgram(qpWatchDog *watchDog, add_ref<tcu::TestLog> log)
1623 {
1624 std::vector<tcu::UVec4> ref;
1625 std::map<uint32_t, uint32_t> subgroupSizeToMaxLoc;
1626
1627 do
1628 {
1629 ops.clear();
1630 while ((int32_t)ops.size() < minCount)
1631 pickOP(1);
1632
1633 // Retry until the program has some UCF results in it
1634 if (caseDef.isUCF())
1635 {
1636 // Simulate for all subgroup sizes, to determine whether OP_BALLOTs are nonuniform
1637 for (int32_t subgroupSize = 4; subgroupSize <= 128; subgroupSize *= 2)
1638 {
1639 //simulate(true, subgroupSize, ref);
1640 const uint32_t maxLoc = execute(watchDog, true, subgroupSize, 0u, invocationStride, ref, log);
1641 subgroupSizeToMaxLoc[subgroupSize] = maxLoc;
1642 }
1643 }
1644 } while (caseDef.isUCF() && !hasUCF());
1645
1646 return subgroupSizeToMaxLoc;
1647 }
1648
1649 void printIndent(std::stringstream &css)
1650 {
1651 for (int32_t i = 0; i < indent; ++i)
1652 css << " ";
1653 }
1654
1655 struct FlowState
1656 {
1657 add_cref<vector<OP>> ops;
1658 const int32_t opsIndex;
1659 const int32_t loopNesting;
1660 const int funcNum;
1661 };
1662
1663 // State of the subgroup at each level of nesting
1664 struct SubgroupState
1665 {
1666 // Currently executing
1667 bitset_inv_t activeMask;
1668 // Have executed a continue instruction in this loop
1669 bitset_inv_t continueMask;
1670 // index of the current if test or loop header
1671 uint32_t header;
1672 // number of loop iterations performed
1673 uint32_t tripCount;
1674 // is this nesting a loop?
1675 uint32_t isLoop;
1676 // is this nesting a function call?
1677 uint32_t isCall;
1678 // is this nesting a switch?
1679 uint32_t isSwitch;
1680 };
1681
1682 struct SubgroupState2
1683 {
1684 // Currently executing
1685 Ballots activeMask;
1686 // Have executed a continue instruction in this loop
1687 Ballots continueMask;
1688 // index of the current if test or loop header
1689 uint32_t header;
1690 // number of loop iterations performed
1691 uint32_t tripCount;
1692 // is this nesting a loop?
1693 uint32_t isLoop;
1694 // is this nesting a function call?
1695 uint32_t isCall;
1696 // is this nesting a switch?
1697 uint32_t isSwitch;
1698 virtual ~SubgroupState2() = default;
1699 SubgroupState2() : SubgroupState2(0)
1700 {
1701 }
1702 SubgroupState2(uint32_t subgroupCount)
1703 : activeMask(subgroupCount)
1704 , continueMask(subgroupCount)
1705 , header()
1706 , tripCount()
1707 , isLoop()
1708 , isCall()
1709 , isSwitch()
1710 {
1711 }
1712 };
1713
1714 struct Prerequisites
1715 {
1716 };
1717
1718 virtual std::string getPartitionBallotText()
1719 {
1720 return "subgroupBallot(true)";
1721 }
1722
1723 virtual void printIfLocalInvocationIndex(std::stringstream &css, add_cref<FlowState> flow)
1724 {
1725 printIndent(css);
1726 css << "if (gl_LocalInvocationIndex >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
1727 }
1728
1729 virtual void printStore(std::stringstream &css, add_cref<FlowState> flow)
1730 {
1731 printIndent(css);
1732 css << "outputC.loc[gl_LocalInvocationIndex]++;\n";
1733 printIndent(css);
1734 css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex].x = 0x" << std::hex
1735 << flow.ops[flow.opsIndex].value << ";\n";
1736 }
1737
1738 virtual void printBallot(std::stringstream &css, add_cref<FlowState>, bool endWithSemicolon = false)
1739 {
1740 printIndent(css);
1741
1742 css << "outputC.loc[gl_LocalInvocationIndex]++,";
1743 // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
1744 // a ballot, to make sure the ballot is "diverged enough". Don't do this for
1745 // subgroup_uniform_control_flow, since we only validate results that must be fully
1746 // reconverged.
1747 if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
1748 {
1749 css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = " << getPartitionBallotText()
1750 << ".xy";
1751 }
1752 else if (caseDef.isElect())
1753 {
1754 css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex].x = elect()";
1755 }
1756 else
1757 {
1758 css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = subgroupBallot(true).xy";
1759 }
1760 if (endWithSemicolon)
1761 {
1762 css << ";\n";
1763 }
1764 }
1765
1766 void printCode(std::stringstream &functions, std::stringstream &main)
1767 {
1768 std::stringstream *css = &main;
1769 indent = 4;
1770 loopNesting = 0;
1771 int funcNum = 0;
1772 int32_t i = 0;
1773
1774 auto makeFlowState = [&]() -> FlowState { return FlowState{ops, i, loopNesting, funcNum}; };
1775
1776 for (; i < (int32_t)ops.size(); ++i)
1777 {
1778 switch (ops[i].type)
1779 {
1780 case OP_IF_MASK:
1781 printIndent(*css);
1782 if (ops[i].value == ~0ULL)
1783 {
1784 // This equality test will always succeed, since inputA.a[i] == i
1785 int idx = deRandom_getUint32(&rnd) % 4;
1786 *css << "if (inputA.a[" << idx << "] == " << idx << ") {\n";
1787 }
1788 else
1789 {
1790 const tcu::UVec4 v(ops[i].bvalue);
1791 *css << std::hex << "if (testBit(uvec4("
1792 << "0x" << v.x() << ", "
1793 << "0x" << v.y() << ", "
1794 << "0x" << v.z() << ", "
1795 << "0x" << v.w() << std::dec << "), gl_SubgroupInvocationID)) {\n";
1796 }
1797 indent += 4;
1798 break;
1799 case OP_IF_LOOPCOUNT:
1800 printIndent(*css);
1801 *css << "if (gl_SubgroupInvocationID == loopIdx" << loopNesting - 1 << ") {\n";
1802 indent += 4;
1803 break;
1804 case OP_IF_LOCAL_INVOCATION_INDEX:
1805 printIfLocalInvocationIndex(*css, makeFlowState());
1806 indent += 4;
1807 break;
1808 case OP_ELSE_MASK:
1809 case OP_ELSE_LOOPCOUNT:
1810 case OP_ELSE_LOCAL_INVOCATION_INDEX:
1811 indent -= 4;
1812 printIndent(*css);
1813 *css << "} else {\n";
1814 indent += 4;
1815 break;
1816 case OP_ENDIF:
1817 indent -= 4;
1818 printIndent(*css);
1819 *css << "}\n";
1820 break;
1821 case OP_BALLOT:
1822 printBallot(*css, makeFlowState(), true);
1823 break;
1824 case OP_STORE:
1825 printStore(*css, makeFlowState());
1826 break;
1827 case OP_BEGIN_FOR_VAR:
1828 printIndent(*css);
1829 *css << "for (int loopIdx" << loopNesting << " = 0;\n";
1830 printIndent(*css);
1831 *css << " loopIdx" << loopNesting << " < gl_SubgroupInvocationID + 1;\n";
1832 printIndent(*css);
1833 *css << " loopIdx" << loopNesting << "++) {\n";
1834 indent += 4;
1835 loopNesting++;
1836 break;
1837 case OP_END_FOR_VAR:
1838 loopNesting--;
1839 indent -= 4;
1840 printIndent(*css);
1841 *css << "}\n";
1842 break;
1843 case OP_BEGIN_FOR_UNIF:
1844 printIndent(*css);
1845 *css << "for (int loopIdx" << loopNesting << " = 0;\n";
1846 printIndent(*css);
1847 *css << " loopIdx" << loopNesting << " < inputA.a[" << ops[i].value << "];\n";
1848 printIndent(*css);
1849 *css << " loopIdx" << loopNesting << "++) {\n";
1850 indent += 4;
1851 loopNesting++;
1852 break;
1853 case OP_END_FOR_UNIF:
1854 loopNesting--;
1855 indent -= 4;
1856 printIndent(*css);
1857 *css << "}\n";
1858 break;
1859 case OP_BEGIN_FOR_INF:
1860 printIndent(*css);
1861 *css << "for (int loopIdx" << loopNesting << " = 0;;loopIdx" << loopNesting << "++,";
1862 loopNesting++;
1863 printBallot(*css, makeFlowState());
1864 *css << ") {\n";
1865 indent += 4;
1866 break;
1867 case OP_END_FOR_INF:
1868 loopNesting--;
1869 indent -= 4;
1870 printIndent(*css);
1871 *css << "}\n";
1872 break;
1873 case OP_BEGIN_DO_WHILE_UNIF:
1874 printIndent(*css);
1875 *css << "{\n";
1876 indent += 4;
1877 printIndent(*css);
1878 *css << "int loopIdx" << loopNesting << " = 0;\n";
1879 printIndent(*css);
1880 *css << "do {\n";
1881 indent += 4;
1882 printIndent(*css);
1883 *css << "loopIdx" << loopNesting << "++;\n";
1884 loopNesting++;
1885 break;
1886 case OP_END_DO_WHILE_UNIF:
1887 loopNesting--;
1888 indent -= 4;
1889 printIndent(*css);
1890 *css << "} while (loopIdx" << loopNesting << " < inputA.a[" << ops[(uint32_t)ops[i].value].value
1891 << "]);\n";
1892 indent -= 4;
1893 printIndent(*css);
1894 *css << "}\n";
1895 break;
1896 case OP_BEGIN_DO_WHILE_INF:
1897 printIndent(*css);
1898 *css << "{\n";
1899 indent += 4;
1900 printIndent(*css);
1901 *css << "int loopIdx" << loopNesting << " = 0;\n";
1902 printIndent(*css);
1903 *css << "do {\n";
1904 indent += 4;
1905 loopNesting++;
1906 break;
1907 case OP_END_DO_WHILE_INF:
1908 loopNesting--;
1909 printIndent(*css);
1910 *css << "loopIdx" << loopNesting << "++;\n";
1911 indent -= 4;
1912 printIndent(*css);
1913 *css << "} while (true);\n";
1914 indent -= 4;
1915 printIndent(*css);
1916 *css << "}\n";
1917 break;
1918 case OP_BREAK:
1919 printIndent(*css);
1920 *css << "break;\n";
1921 break;
1922 case OP_CONTINUE:
1923 printIndent(*css);
1924 *css << "continue;\n";
1925 break;
1926 case OP_ELECT:
1927 printIndent(*css);
1928 *css << "if (subgroupElect()) {\n";
1929 indent += 4;
1930 break;
1931 case OP_RETURN:
1932 printIndent(*css);
1933 *css << "return;\n";
1934 break;
1935 case OP_CALL_BEGIN:
1936 printIndent(*css);
1937 *css << "func" << funcNum << "(";
1938 for (int32_t n = 0; n < loopNesting; ++n)
1939 {
1940 *css << "loopIdx" << n;
1941 if (n != loopNesting - 1)
1942 *css << ", ";
1943 }
1944 *css << ");\n";
1945 css = &functions;
1946 printIndent(*css);
1947 *css << "void func" << funcNum << "(";
1948 for (int32_t n = 0; n < loopNesting; ++n)
1949 {
1950 *css << "int loopIdx" << n;
1951 if (n != loopNesting - 1)
1952 *css << ", ";
1953 }
1954 *css << ") {\n";
1955 indent += 4;
1956 funcNum++;
1957 break;
1958 case OP_CALL_END:
1959 indent -= 4;
1960 printIndent(*css);
1961 *css << "}\n";
1962 css = &main;
1963 break;
1964 case OP_NOISE:
1965 if (ops[i].value == 0)
1966 {
1967 printIndent(*css);
1968 *css << "while (!subgroupElect()) {}\n";
1969 }
1970 else
1971 {
1972 printIndent(*css);
1973 *css << "if (inputA.a[0] == 12345) {\n";
1974 indent += 4;
1975 printIndent(*css);
1976 *css << "while (true) {\n";
1977 indent += 4;
1978 printBallot(*css, makeFlowState(), true);
1979 indent -= 4;
1980 printIndent(*css);
1981 *css << "}\n";
1982 indent -= 4;
1983 printIndent(*css);
1984 *css << "}\n";
1985 }
1986 break;
1987 case OP_SWITCH_UNIF_BEGIN:
1988 printIndent(*css);
1989 *css << "switch (inputA.a[" << ops[i].value << "]) {\n";
1990 indent += 4;
1991 break;
1992 case OP_SWITCH_VAR_BEGIN:
1993 printIndent(*css);
1994 *css << "switch (gl_SubgroupInvocationID & 3) {\n";
1995 indent += 4;
1996 break;
1997 case OP_SWITCH_LOOP_COUNT_BEGIN:
1998 printIndent(*css);
1999 *css << "switch (loopIdx" << ops[i].value << ") {\n";
2000 indent += 4;
2001 break;
2002 case OP_SWITCH_END:
2003 indent -= 4;
2004 printIndent(*css);
2005 *css << "}\n";
2006 break;
2007 case OP_CASE_MASK_BEGIN:
2008 for (int32_t b = 0; b < 32; ++b)
2009 {
2010 if ((1u << b) & ops[i].caseValue)
2011 {
2012 printIndent(*css);
2013 *css << "case " << b << ":\n";
2014 }
2015 }
2016 printIndent(*css);
2017 *css << "{\n";
2018 indent += 4;
2019 break;
2020 case OP_CASE_LOOP_COUNT_BEGIN:
2021 if (ops[i].caseValue == 0xFFFFFFFF)
2022 {
2023 printIndent(*css);
2024 *css << "default: {\n";
2025 }
2026 else
2027 {
2028 printIndent(*css);
2029 *css << "case " << ops[i].caseValue << ": {\n";
2030 }
2031 indent += 4;
2032 break;
2033 case OP_CASE_END:
2034 printIndent(*css);
2035 *css << "break;\n";
2036 indent -= 4;
2037 printIndent(*css);
2038 *css << "}\n";
2039 break;
2040 default:
2041 DE_ASSERT(0);
2042 break;
2043 }
2044 }
2045 }
2046
2047 // Simulate execution of the program. If countOnly is true, just return
2048 // the max number of outputs written. If it's false, store out the result
2049 // values to ref.
2050 virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) = 0;
2051
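    // Ballots-based interpreter for the generated op stream. It walks ops[] with a stack
    // of SubgroupState2 (one entry per nesting level), updating the active mask exactly as
    // the emitted GLSL would, and either just counts output locations (countOnly) or writes
    // the expected values through the stage-specific simulateStore()/simulateBallot() hooks.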
2052 virtual uint32_t execute(qpWatchDog *watchDog, bool countOnly, const uint32_t subgroupSize,
2053 const uint32_t fragmentStride, const uint32_t primitiveStride,
2054 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2055 add_cref<std::vector<uint32_t>> outputP = {}, const tcu::UVec4 *cmp = nullptr,
2056 const uint32_t primitiveID = (~0u))
2057 {
2058 // Per-invocation output location counters
2059 std::vector<uint32_t> outLoc;
2060 std::vector<SubgroupState2> stateStack;
2061 uint32_t subgroupCount;
2062 uint32_t logFailureCount;
2063 auto prerequisites = makePrerequisites(outputP, subgroupSize, fragmentStride, primitiveStride, stateStack,
2064 outLoc, subgroupCount);
2065 const Ballot fullSubgroupMask = subgroupSizeToMask(subgroupSize, subgroupCount);
2066
2067 logFailureCount = 10u;
2068 nesting = 0;
2069 loopNesting = 0;
2070
2071 int32_t i = 0;
2072 uint32_t loopCount = 0;
2073
2074 while (i < (int32_t)ops.size())
2075 {
2076 add_cref<Ballots> activeMask = stateStack[nesting].activeMask;
2077
2078 if ((loopCount % 5000) == 0 && watchDog)
2079 qpWatchDog_touch(watchDog);
2080
2081 switch (ops[i].type)
2082 {
2083 case OP_BALLOT:
2084 // Flag that this ballot is workgroup-nonuniform
2085 if (caseDef.isWUCF() && activeMask.any() && !activeMask.all())
2086 ops[i].caseValue = 1;
2087
2088 if (caseDef.isSUCF())
2089 {
2090 for (uint32_t id = 0; id < invocationStride; id += subgroupSize)
2091 {
2092 const Ballot subgroupMask = bitsetToBallot(activeMask, fullSubgroupMask, subgroupSize, id);
2093 // Flag that this ballot is subgroup-nonuniform
2094 if (subgroupMask != 0 && subgroupMask != fullSubgroupMask)
2095 ops[i].caseValue = 1;
2096 }
2097 }
2098
2099 simulateBallot(countOnly, activeMask, primitiveID, i, outLoc, ref, log, prerequisites, logFailureCount,
2100 (i > 0 ? ops[i - 1].type : OP_BALLOT), cmp);
2101 break;
2102 case OP_STORE:
2103 simulateStore(countOnly, stateStack[nesting].activeMask, primitiveID, ops[i].value, outLoc, ref, log,
2104 prerequisites, logFailureCount, (i > 0 ? ops[i - 1].type : OP_STORE), cmp);
2105 break;
2106 case OP_IF_MASK:
2107 nesting++;
2108 stateStack[nesting].activeMask =
2109 stateStack[nesting - 1].activeMask & ballotsFromBallot(ops[i].bvalue, subgroupSize, subgroupCount);
2110 stateStack[nesting].header = i;
2111 stateStack[nesting].isLoop = 0;
2112 stateStack[nesting].isSwitch = 0;
2113 break;
2114 case OP_ELSE_MASK:
2115 stateStack[nesting].activeMask =
2116 stateStack[nesting - 1].activeMask &
2117 ~ballotsFromBallot(ops[stateStack[nesting].header].bvalue, subgroupSize, subgroupCount);
2118 break;
2119 case OP_IF_LOOPCOUNT:
2120 {
2121 uint32_t n = nesting;
2122 while (!stateStack[n].isLoop)
2123 n--;
2124 const Ballot tripBallot = Ballot::withSetBit(stateStack[n].tripCount);
2125
2126 nesting++;
2127 stateStack[nesting].activeMask =
2128 stateStack[nesting - 1].activeMask & ballotsFromBallot(tripBallot, subgroupSize, subgroupCount);
2129 stateStack[nesting].header = i;
2130 stateStack[nesting].isLoop = 0;
2131 stateStack[nesting].isSwitch = 0;
2132 break;
2133 }
2134 case OP_ELSE_LOOPCOUNT:
2135 {
2136 uint32_t n = nesting;
2137 while (!stateStack[n].isLoop)
2138 n--;
2139 const Ballot tripBallot = Ballot::withSetBit(stateStack[n].tripCount);
2140
2141 stateStack[nesting].activeMask =
2142 stateStack[nesting - 1].activeMask & ~ballotsFromBallot(tripBallot, subgroupSize, subgroupCount);
2143 break;
2144 }
2145 case OP_IF_LOCAL_INVOCATION_INDEX:
2146 {
2147 // all bits >= N
2148 Ballots mask(subgroupCount);
2149 const uint32_t maxID = subgroupCount * subgroupSize;
2150 for (uint32_t id = static_cast<uint32_t>(ops[i].value); id < maxID; ++id)
2151 {
2152 mask.set(Ballots::findBit(id, subgroupSize));
2153 }
2154
2155 nesting++;
2156 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
2157 stateStack[nesting].header = i;
2158 stateStack[nesting].isLoop = 0;
2159 stateStack[nesting].isSwitch = 0;
2160 break;
2161 }
2162 case OP_ELSE_LOCAL_INVOCATION_INDEX:
2163 {
2164 // all bits < N
2165 Ballots mask(subgroupCount);
2166 const uint32_t maxID = subgroupCount * subgroupSize;
2167 for (uint32_t id = 0u; id < static_cast<uint32_t>(ops[i].value) && id < maxID; ++id)
2168 {
2169 mask.set(Ballots::findBit(id, subgroupSize));
2170 }
2171
2172 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
2173 break;
2174 }
2175 case OP_ENDIF:
2176 nesting--;
2177 break;
2178 case OP_BEGIN_FOR_UNIF:
2179 // XXX TODO: We don't handle a for loop with zero iterations
2180 nesting++;
2181 loopNesting++;
2182 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2183 stateStack[nesting].header = i;
2184 stateStack[nesting].tripCount = 0;
2185 stateStack[nesting].isLoop = 1;
2186 stateStack[nesting].isSwitch = 0;
2187 stateStack[nesting].continueMask = 0;
2188 break;
2189 case OP_END_FOR_UNIF:
2190 stateStack[nesting].tripCount++;
2191 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2192 stateStack[nesting].continueMask = 0;
2193 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
2194 stateStack[nesting].activeMask.any())
2195 {
2196 i = stateStack[nesting].header + 1;
2197 continue;
2198 }
2199 else
2200 {
2201 loopNesting--;
2202 nesting--;
2203 }
2204 break;
2205 case OP_BEGIN_DO_WHILE_UNIF:
2206 // XXX TODO: We don't handle a do/while loop with zero iterations
2207 nesting++;
2208 loopNesting++;
2209 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2210 stateStack[nesting].header = i;
2211 stateStack[nesting].tripCount = 1;
2212 stateStack[nesting].isLoop = 1;
2213 stateStack[nesting].isSwitch = 0;
2214 stateStack[nesting].continueMask = 0;
2215 break;
2216 case OP_END_DO_WHILE_UNIF:
2217 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2218 stateStack[nesting].continueMask = 0;
2219 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
2220 stateStack[nesting].activeMask.any())
2221 {
2222 i = stateStack[nesting].header + 1;
2223 stateStack[nesting].tripCount++;
2224 continue;
2225 }
2226 else
2227 {
2228 loopNesting--;
2229 nesting--;
2230 }
2231 break;
2232 case OP_BEGIN_FOR_VAR:
2233 // XXX TODO: We don't handle a for loop with zero iterations
2234 nesting++;
2235 loopNesting++;
2236 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2237 stateStack[nesting].header = i;
2238 stateStack[nesting].tripCount = 0;
2239 stateStack[nesting].isLoop = 1;
2240 stateStack[nesting].isSwitch = 0;
2241 stateStack[nesting].continueMask = 0;
2242 break;
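            // The generated loop runs gl_SubgroupInvocationID + 1 iterations, so after bumping
            // tripCount only lanes with gl_SubgroupInvocationID >= tripCount (bits
            // [tripCount, subgroupSize) of tripBallot) stay active.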
2243 case OP_END_FOR_VAR:
2244 {
2245 stateStack[nesting].tripCount++;
2246 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2247 stateStack[nesting].continueMask = 0;
2248 Ballot tripBallot;
2249 if (subgroupSize != stateStack[nesting].tripCount)
2250 {
2251 for (uint32_t bit = stateStack[nesting].tripCount; bit < tripBallot.size(); ++bit)
2252 tripBallot.set(bit);
2253 }
2254 stateStack[nesting].activeMask &= ballotsFromBallot(tripBallot, subgroupSize, subgroupCount);
2255
2256 if (stateStack[nesting].activeMask.any())
2257 {
2258 i = stateStack[nesting].header + 1;
2259 continue;
2260 }
2261 else
2262 {
2263 loopNesting--;
2264 nesting--;
2265 }
2266 break;
2267 }
2268 case OP_BEGIN_FOR_INF:
2269 case OP_BEGIN_DO_WHILE_INF:
2270 nesting++;
2271 loopNesting++;
2272 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2273 stateStack[nesting].header = i;
2274 stateStack[nesting].tripCount = 0;
2275 stateStack[nesting].isLoop = 1;
2276 stateStack[nesting].isSwitch = 0;
2277 stateStack[nesting].continueMask = 0;
2278 break;
2279 case OP_END_FOR_INF:
2280 stateStack[nesting].tripCount++;
2281 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2282 stateStack[nesting].continueMask = 0;
2283 if (stateStack[nesting].activeMask.any())
2284 {
2285 // output expected OP_BALLOT values
2286 simulateBallot(countOnly, stateStack[nesting].activeMask, primitiveID, i, outLoc, ref, log,
2287 prerequisites, logFailureCount, (i > 0 ? ops[i - 1].type : OP_BALLOT), cmp);
2288
2289 i = stateStack[nesting].header + 1;
2290 continue;
2291 }
2292 else
2293 {
2294 loopNesting--;
2295 nesting--;
2296 }
2297 break;
2298 case OP_END_DO_WHILE_INF:
2299 stateStack[nesting].tripCount++;
2300 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
2301 stateStack[nesting].continueMask = 0;
2302 if (stateStack[nesting].activeMask.any())
2303 {
2304 i = stateStack[nesting].header + 1;
2305 continue;
2306 }
2307 else
2308 {
2309 loopNesting--;
2310 nesting--;
2311 }
2312 break;
2313 case OP_BREAK:
2314 {
2315 uint32_t n = nesting;
2316 const Ballots mask = stateStack[nesting].activeMask;
2317 while (true)
2318 {
2319 stateStack[n].activeMask &= ~mask;
2320 if (stateStack[n].isLoop || stateStack[n].isSwitch)
2321 break;
2322
2323 n--;
2324 }
2325 }
2326 break;
2327 case OP_CONTINUE:
2328 {
2329 uint32_t n = nesting;
2330 const Ballots mask = stateStack[nesting].activeMask;
2331 while (true)
2332 {
2333 stateStack[n].activeMask &= ~mask;
2334 if (stateStack[n].isLoop)
2335 {
2336 stateStack[n].continueMask |= mask;
2337 break;
2338 }
2339 n--;
2340 }
2341 }
2342 break;
2343 case OP_ELECT:
2344 {
2345 nesting++;
2346 stateStack[nesting].activeMask = bitsetElect(stateStack[nesting - 1].activeMask);
2347 stateStack[nesting].header = i;
2348 stateStack[nesting].isLoop = 0;
2349 stateStack[nesting].isSwitch = 0;
2350 }
2351 break;
2352 case OP_RETURN:
2353 {
2354 const Ballots mask = stateStack[nesting].activeMask;
2355 for (int32_t n = nesting; n >= 0; --n)
2356 {
2357 stateStack[n].activeMask &= ~mask;
2358 if (stateStack[n].isCall)
2359 break;
2360 }
2361 }
2362 break;
2363
2364 case OP_CALL_BEGIN:
2365 nesting++;
2366 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2367 stateStack[nesting].isLoop = 0;
2368 stateStack[nesting].isSwitch = 0;
2369 stateStack[nesting].isCall = 1;
2370 break;
2371 case OP_CALL_END:
2372 stateStack[nesting].isCall = 0;
2373 nesting--;
2374 break;
2375 case OP_NOISE:
2376 break;
2377
2378 case OP_SWITCH_UNIF_BEGIN:
2379 case OP_SWITCH_VAR_BEGIN:
2380 case OP_SWITCH_LOOP_COUNT_BEGIN:
2381 nesting++;
2382 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2383 stateStack[nesting].header = i;
2384 stateStack[nesting].isLoop = 0;
2385 stateStack[nesting].isSwitch = 1;
2386 break;
2387 case OP_SWITCH_END:
2388 nesting--;
2389 break;
2390 case OP_CASE_MASK_BEGIN:
2391 stateStack[nesting].activeMask =
2392 stateStack[nesting - 1].activeMask & ballotsFromBallot(ops[i].bvalue, subgroupSize, subgroupCount);
2393 break;
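            // Walk up the nesting stack to the loop this switch keys on (identified by
            // ops[header].value, counting loop levels from the outside in) and take this
            // case only if that loop's current trip count matches one of the case labels
            // encoded in the case mask.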
2394 case OP_CASE_LOOP_COUNT_BEGIN:
2395 {
2396 uint32_t n = nesting;
2397 uint32_t l = loopNesting;
2398
2399 while (true)
2400 {
2401 if (stateStack[n].isLoop)
2402 {
2403 l--;
2404 if (l == ops[stateStack[nesting].header].value)
2405 break;
2406 }
2407 n--;
2408 }
2409
2410 if ((Ballot::withSetBit(stateStack[n].tripCount) & Ballot(ops[i].bvalue)).any())
2411 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
2412 else
2413 stateStack[nesting].activeMask = 0;
2414 break;
2415 }
2416 case OP_CASE_END:
2417 break;
2418
2419 default:
2420 DE_ASSERT(0);
2421 break;
2422 }
2423 i++;
2424 loopCount++;
2425 }
2426 uint32_t maxLoc = 0;
2427 for (uint32_t id = 0; id < (uint32_t)outLoc.size(); ++id)
2428 maxLoc = de::max(maxLoc, outLoc[id]);
2429
2430 return maxLoc;
2431 }
2432
2433 bool hasUCF() const
2434 {
2435 for (int32_t i = 0; i < (int32_t)ops.size(); ++i)
2436 {
2437 if (ops[i].type == OP_BALLOT && ops[i].caseValue == 0)
2438 return true;
2439 }
2440 return false;
2441 }
2442
2443 protected:
2444 virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
2445 const uint32_t subgroupSize, const uint32_t fragmentStride,
2446 const uint32_t primitiveStride,
2447 add_ref<std::vector<SubgroupState2>> stateStack,
2448 add_ref<std::vector<uint32_t>> outLoc,
2449 add_ref<uint32_t> subgroupCount)
2450 {
2451 DE_UNREF(outputP);
2452 DE_UNREF(subgroupSize);
2453 DE_UNREF(fragmentStride);
2454 DE_UNREF(primitiveStride);
2455 DE_UNREF(stateStack);
2456 DE_UNREF(outLoc);
2457 DE_UNREF(subgroupCount);
2458 return std::make_shared<Prerequisites>();
2459 }
2460
2461 virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t primitiveID,
2462 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
2463 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2464 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2465 const OPType reason, const tcu::UVec4 *cmp)
2466 {
2467 DE_UNREF(countOnly);
2468 DE_UNREF(activeMask);
2469 DE_UNREF(primitiveID);
2470 DE_UNREF(opsIndex);
2471 DE_UNREF(outLoc);
2472 DE_UNREF(ref);
2473 DE_UNREF(log);
2474 DE_UNREF(prerequisites);
2475 DE_UNREF(logFailureCount);
2476 DE_UNREF(reason);
2477 DE_UNREF(cmp);
2478 }
2479
2480 virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t primitiveID,
2481 const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
2482 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2483 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2484 const OPType reason, const tcu::UVec4 *cmp)
2485 {
2486 DE_UNREF(countOnly);
2487 DE_UNREF(activeMask);
2488 DE_UNREF(primitiveID);
2489 DE_UNREF(storeValue);
2490 DE_UNREF(outLoc);
2491 DE_UNREF(ref);
2492 DE_UNREF(log);
2493 DE_UNREF(prerequisites);
2494 DE_UNREF(logFailureCount);
2495 DE_UNREF(reason);
2496 DE_UNREF(cmp);
2497 }
2498 };
2499
2500 class ComputeRandomProgram : public RandomProgram
2501 {
2502 public:
2503 ComputeRandomProgram(const CaseDef &c) : RandomProgram(c, uint32_t(c.sizeX * c.sizeY))
2504 {
2505 DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_COMPUTE_BIT);
2506 }
2507 virtual ~ComputeRandomProgram() = default;
2508
2509 virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
2510 {
2511 DE_ASSERT(false);
2512 // Do not use this method, to simulate generated program use simulate2 instead
2513 DE_UNREF(countOnly);
2514 DE_UNREF(subgroupSize);
2515 DE_UNREF(ref);
2516 return 0;
2517 }
2518
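    // Compute-stage bookkeeping: the subgroup geometry plus a per-subgroup cache of the first
    // ballot computed at a given output location, so every invocation of a subgroup records
    // the same expected ballot value.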
2519 struct ComputePrerequisites : Prerequisites
2520 {
2521 const uint32_t subgroupSize;
2522 const uint32_t subgroupCount;
2523 const Ballot subgroupSizeMask;
2524 std::vector<std::pair<bool, tcu::UVec4>> ballots;
2525 ComputePrerequisites(uint32_t subgroupSize_, uint32_t subgroupCount_)
2526 : subgroupSize(subgroupSize_)
2527 , subgroupCount(subgroupCount_)
2528 , subgroupSizeMask(subgroupSizeToMask(subgroupSize, subgroupCount))
2529 , ballots(subgroupCount_)
2530 {
2531 }
2532 };
2533
2534 virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
2535 bool endWithSemicolon = false) override
2536 {
2537 printIndent(css);
2538
2539 css << "outputC.loc[gl_LocalInvocationIndex]++,";
2540 // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
2541 // a ballot, to make sure the ballot is "diverged enough". Don't do this for
2542 // subgroup_uniform_control_flow, since we only validate results that must be fully
2543 // reconverged.
2544 if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
2545 {
2546 css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = " << getPartitionBallotText();
2547 }
2548 else if (caseDef.isElect())
2549 {
2550 css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex].x = elect()";
2551 }
2552 else
2553 {
2554 css << "outputB.b[(outLoc++)*invocationStride + gl_LocalInvocationIndex] = subgroupBallot(true)";
2555 }
2556 if (endWithSemicolon)
2557 {
2558 css << ";\n";
2559 }
2560 }
2561
2562 protected:
2563 virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
2564 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
2565 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2566 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2567 const OPType reason, const tcu::UVec4 *cmp) override
2568 {
2569 DE_UNREF(unusedPrimitiveID);
2570 DE_UNREF(log);
2571 DE_UNREF(logFailureCount);
2572 DE_UNREF(reason);
2573 DE_UNREF(cmp);
2574 auto pre = static_pointer_cast<ComputePrerequisites>(prerequisites);
2575 const uint32_t subgroupCount = activeMask.subgroupCount();
2576 const uint32_t subgroupSize = pre->subgroupSize;
2577
2578 std::fill_n(pre->ballots.begin(), subgroupCount, std::pair<bool, tcu::UVec4>());
2579
2580 for (uint32_t id = 0; id < invocationStride; ++id)
2581 {
2582 if (activeMask.test((Ballots::findBit(id, subgroupSize))))
2583 {
2584 if (countOnly)
2585 {
2586 outLoc[id]++;
2587 }
2588 else
2589 {
2590 if (ops[opsIndex].caseValue)
2591 {
2592 // Emit a magic value to indicate that we shouldn't validate this ballot
2593 ref[(outLoc[id]++) * invocationStride + id] =
2594 bitsetToBallot(0x12345678, subgroupCount, subgroupSize, id);
2595 }
2596 else
2597 {
2598 add_ref<std::pair<bool, tcu::UVec4>> info(pre->ballots.at(id / subgroupSize));
2599 if (false == info.first)
2600 {
2601 info.first = true;
2602 info.second = bitsetToBallot(activeMask, pre->subgroupSizeMask, subgroupSize, id);
2603 }
2604 ref[(outLoc[id]++) * invocationStride + id] = info.second;
2605 }
2606 }
2607 }
2608 }
2609 }
2610
2611 virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
2612 const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
2613 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2614 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2615 const OPType reason, const tcu::UVec4 *cmp) override
2616 {
2617 DE_UNREF(unusedPrimitiveID);
2618 DE_UNREF(log);
2619 DE_UNREF(logFailureCount);
2620 DE_UNREF(reason);
2621 DE_UNREF(cmp);
2622 const uint32_t subgroupSize = static_pointer_cast<ComputePrerequisites>(prerequisites)->subgroupSize;
2623 for (uint32_t id = 0; id < invocationStride; ++id)
2624 {
2625 if (activeMask.test(Ballots::findBit(id, subgroupSize)))
2626 {
2627 if (countOnly)
2628 outLoc[id]++;
2629 else
2630 ref[(outLoc[id]++) * invocationStride + id][0] = uint32_t(storeValue & 0xFFFFFFFF);
2631 }
2632 }
2633 }
2634
2635 virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
2636 const uint32_t subgroupSize, const uint32_t fragmentStride,
2637 const uint32_t primitiveStride,
2638 add_ref<std::vector<SubgroupState2>> stateStack,
2639 add_ref<std::vector<uint32_t>> outLoc,
2640 add_ref<uint32_t> subgroupCount) override
2641 {
2642 DE_UNREF(outputP);
2643 DE_UNREF(fragmentStride);
2644 DE_ASSERT(invocationStride == primitiveStride);
2645 subgroupCount = ROUNDUP(invocationStride, subgroupSize) / subgroupSize;
2646 auto prerequisites = std::make_shared<ComputePrerequisites>(subgroupSize, subgroupCount);
2647 stateStack.resize(10u, SubgroupState2(subgroupCount));
2648 outLoc.resize(primitiveStride, 0u);
2649 add_ref<Ballots> activeMask(stateStack.at(0).activeMask);
2650 for (uint32_t id = 0; id < invocationStride; ++id)
2651 {
2652 activeMask.set(Ballots::findBit(id, subgroupSize));
2653 }
2654 return prerequisites;
2655 }
2656 };
2657
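// Fragment-stage variant. The shader reports its subgroup layout through the outputP buffer
// (packed subgroup/invocation IDs, with bit 31 marking helper invocations), and the reference
// values are computed once per primitive in execute().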
2658 class FragmentRandomProgram : public RandomProgram
2659 {
2660 public:
2661 #define BALLOT_STACK_SIZE_DEFVAL_LINE (__LINE__ + 1)
2662 static constexpr const uint32_t experimentalOutLocSize = 16384;
2663 static constexpr const uint32_t conditionIfInvocationStride = 511u;
2664 FragmentRandomProgram(const CaseDef &c) : RandomProgram(c, conditionIfInvocationStride)
2665 {
2666 DE_ASSERT(caseDef.testType == TT_MAXIMAL);
2667 DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_FRAGMENT_BIT);
2668 }
2669 virtual ~FragmentRandomProgram() = default;
2670
2671 static de::MovePtr<FragmentRandomProgram> create(const CaseDef &c)
2672 {
2673 return de::MovePtr<FragmentRandomProgram>(new FragmentRandomProgram(c));
2674 }
2675
2676 virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
2677 {
2678 printIndent(css);
2679 css << "if (invocationIndex() >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
2680 }
2681
2682 virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
2683 {
2684 printIndent(css);
2685 css << "storeValue(outLoc++, 0x" << std::hex << flow.ops[flow.opsIndex].value << ");\n";
2686 }
2687
2688 virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
2689 bool endWithSemicolon = false) override
2690 {
2691 printIndent(css);
2692 // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
2693 // a ballot, to make sure the ballot is "diverged enough". Don't do this for
2694 // subgroup_uniform_control_flow, since we only validate results that must be fully
2695 // reconverged.
2696 if (loopNesting > 0)
2697 {
2698 css << "storeBallot(outLoc++)";
2699 }
2700 else
2701 {
2702 css << getPartitionBallotText();
2703 }
2704 if (endWithSemicolon)
2705 {
2706 css << ";\n";
2707 }
2708 }
2709
2710 virtual std::string getPartitionBallotText() override
2711 {
2712 return "storeBallot(outLoc++)";
2713 }
2714
2715 virtual void genIf(IFType ifType, uint32_t maxLocalIndexCmp = 0u) override
2716 {
2717 DE_UNREF(maxLocalIndexCmp);
2718 RandomProgram::genIf(ifType, conditionIfInvocationStride);
2719 }
2720
2721 struct Arrangement : Prerequisites, ReconvergenceTestFragmentInstance::Arrangement
2722 {
2723 const uint32_t m_width;
2724 const uint32_t m_height;
2725 const uint32_t m_subgroupSize;
2726 const uint32_t m_fragmentStride;
2727 const uint32_t m_primitiveStride;
2728 const uint32_t m_subgroupCount;
2729 const Ballots m_initialBallots;
2730 const Ballots m_nonHelperInitialBallots;
2731 const uint32_t m_invocationStride;
2732 const std::vector<std::vector<uint32_t>> m_fragmentSubgroups;
2733 Arrangement(add_cref<std::vector<uint32_t>> info, uint32_t width, uint32_t height, uint32_t subgroupSize,
2734 uint32_t primitiveStride)
2735 : m_width(width)
2736 , m_height(height)
2737 , m_subgroupSize(subgroupSize)
2738 , m_fragmentStride(width * height)
2739 , m_primitiveStride(primitiveStride)
2740 , m_subgroupCount(calcSubgroupCount(info, primitiveStride, m_fragmentStride))
2741 , m_initialBallots(makeInitialBallots(info, primitiveStride, m_fragmentStride, false))
2742 , m_nonHelperInitialBallots(makeInitialBallots(info, primitiveStride, m_fragmentStride, true))
2743 , m_invocationStride(calcInvocationStride(info, subgroupSize, primitiveStride, m_fragmentStride))
2744 , m_fragmentSubgroups(makeFragmentSubgroups(info, subgroupSize, primitiveStride, m_fragmentStride))
2745 {
2746 }
2747 static uint32_t calcSubgroupCount(add_cref<std::vector<uint32_t>> info, const uint32_t primitiveStride,
2748 const uint32_t fragmentStride)
2749 {
2750 const uint32_t cc = fragmentStride * primitiveStride;
2751 std::set<uint32_t> s;
2752 uint32_t subgroupID;
2753 uint32_t subgroupInvocationID;
2754 uint32_t isHelperInvocation;
2755 for (uint32_t c = 0u; c < cc; ++c)
2756 {
2757 if (validID(info.at(c), subgroupID, subgroupInvocationID, isHelperInvocation))
2758 s.insert(subgroupID);
2759 }
2760 const uint32_t gMin = *s.begin();
2761 DE_UNREF(gMin);
2762 const uint32_t gMax = *std::next(s.begin(), (s.size() - 1u));
2763 DE_UNREF(gMax);
2764 DE_ASSERT(gMin == 0u);
2765 DE_ASSERT(gMax == (s.size() - 1u));
2766 return static_cast<uint32_t>(s.size());
2767 }
2768 static uint32_t calcInvocationStride(add_cref<std::vector<uint32_t>> info, const uint32_t subgroupSize,
2769 const uint32_t primitiveStride, const uint32_t fragmentStride)
2770 {
2771 return calcSubgroupCount(info, fragmentStride, primitiveStride) * subgroupSize;
2772 }
2773 static Ballots makeInitialBallots(add_cref<std::vector<uint32_t>> info, const uint32_t primitiveStride,
2774 const uint32_t fragmentStride, bool excludeHelpers)
2775 {
2776 uint32_t subgroupID;
2777 uint32_t subgroupInvocationID;
2778 uint32_t isHelperInvocation;
2779 Ballots b(calcSubgroupCount(info, fragmentStride, primitiveStride));
2780 const uint32_t cc = fragmentStride * primitiveStride;
2781 for (uint32_t c = 0u; c < cc; ++c)
2782 {
2783 if (validID(info.at(c), subgroupID, subgroupInvocationID, isHelperInvocation))
2784 {
2785 if (!(excludeHelpers && (isHelperInvocation != 0)))
2786 b.at(subgroupID).set(subgroupInvocationID);
2787 }
2788 }
2789 return b;
2790 }
2791 // Fully Qualified Invocation Name
2792 static uint32_t fqin(uint32_t maybeHelperFragmentFQIN, add_ref<uint32_t> isHelperInvocation)
2793 {
2794 isHelperInvocation = maybeHelperFragmentFQIN >> 31;
2795 return (maybeHelperFragmentFQIN & 0x7FFFFFFF);
2796 }
2797 static auto makeFragmentSubgroups(add_cref<std::vector<uint32_t>> info, const uint32_t subgroupSize,
2798 const uint32_t primitiveStride, const uint32_t fragmentStride)
2799 -> std::vector<std::vector<uint32_t>>
2800 {
2801 const uint32_t subgroupCount = calcSubgroupCount(info, fragmentStride, primitiveStride);
2802 std::vector<std::vector<uint32_t>> map(primitiveStride);
2803 for (uint32_t p = 0u; p < primitiveStride; ++p)
2804 map[p].resize(fragmentStride, (subgroupCount * subgroupSize));
2805
2806 uint32_t subgroupID;
2807 uint32_t subgroupInvocationID;
2808 uint32_t isHelperInvocation;
2809 for (uint32_t p = 0u; p < primitiveStride; ++p)
2810 for (uint32_t f = 0u; f < fragmentStride; ++f)
2811 {
2812 const uint32_t sgid = info.at(f * primitiveStride + p);
2813 if (validID(sgid, subgroupID, subgroupInvocationID, isHelperInvocation))
2814 map.at(p).at(f) =
2815 (subgroupID * subgroupSize + subgroupInvocationID) | (isHelperInvocation << 31);
2816 }
2817 return map;
2818 }
2819 static uint32_t calcRealInvocationCount(add_cref<std::vector<uint32_t>> info, uint32_t primitiveStride,
2820 uint32_t fragmentStride)
2821 {
2822 const uint32_t cc = fragmentStride * primitiveStride;
2823 uint32_t n = 0u;
2824 for (uint32_t c = 0u; c < cc; ++c)
2825 {
2826 if (info[c])
2827 ++n;
2828 }
2829 return n;
2830 }
2831
2832 private:
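        // Decodes one outputP entry: 0 means "no invocation rasterized here"; otherwise
        // bits [0,15] hold gl_SubgroupInvocationID, bits [16,30] hold gl_SubgroupID + 1,
        // and bit 31 is set for helper invocations.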
2833 static bool validID(const uint32_t id)
2834 {
2835 uint32_t subgroupID;
2836 DE_UNREF(subgroupID);
2837 uint32_t subgroupInvocationID;
2838 DE_UNREF(subgroupInvocationID);
2839 uint32_t isHelperInvocation;
2840 DE_UNREF(isHelperInvocation);
2841 return validID(id, subgroupID, subgroupInvocationID, isHelperInvocation);
2842 }
2843 static bool validID(const uint32_t id, add_ref<uint32_t> subgroupID, add_ref<uint32_t> subgroupInvocationID,
2844 add_ref<uint32_t> isHelperInvocation)
2845 {
2846 if (id != 0u)
2847 {
2848 subgroupInvocationID = (id & 0xFFFF);
2849 subgroupID = ((id >> 16) & 0x7FFF) - 1u;
2850 isHelperInvocation = (id >> 31);
2851 return true;
2852 }
2853 return false;
2854 }
2855 };
2856
2857 virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
2858 {
2859 DE_ASSERT(false); // use overloaded version of simulate() instead
2860 DE_UNREF(countOnly);
2861 DE_UNREF(subgroupSize);
2862 DE_UNREF(ref);
2863 return 0;
2864 }
2865
2866 // Simulate execution of the program. If countOnly is true, just return
2867 // the max number of outputs written. If it's false, store out the result
2868 // values to ref.
2869 virtual uint32_t execute(qpWatchDog *watchDog, bool countOnly, const uint32_t subgroupSize,
2870 const uint32_t fragmentStride, const uint32_t primitiveStride,
2871 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2872 add_cref<std::vector<uint32_t>> outputP, const tcu::UVec4 *cmp = nullptr,
2873 const uint32_t reserved = (~0u)) override
2874 {
2875 DE_UNREF(reserved);
2876 uint32_t outLocs = 0u;
2877 uint32_t maxOutLocs = 0u;
2878 for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
2879 {
2880 outLocs = RandomProgram::execute(watchDog, countOnly, subgroupSize, fragmentStride, primitiveStride, ref,
2881 log, outputP, cmp, primitiveID);
2882 maxOutLocs = std::max(outLocs, maxOutLocs);
2883 }
2884 return maxOutLocs;
2885 }
2886
2887 protected:
2888 virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t primitiveID,
2889 const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
2890 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2891 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2892 const OPType reason, const tcu::UVec4 *cmp) override
2893 {
2894 uint32_t isHelperInvocation;
2895 add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
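        // Reference slots are laid out per primitive with a stride of (subgroupCount * 128)
        // entries, i.e. one slot per possible lane of a 128-bit (uvec4) ballot per subgroup.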
2896 for (const uint32_t id : a.m_fragmentSubgroups.at(primitiveID))
2897 {
2898 const uint32_t sgid = a.fqin(id, isHelperInvocation);
2899 if (sgid >= (a.m_subgroupCount * a.m_subgroupSize))
2900 continue;
2901 if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
2902 continue;
2903 const uint32_t loc = primitiveID * a.m_subgroupCount * 128 + sgid;
2904 const uint32_t index = ((outLoc.at(loc)++) * (a.m_primitiveStride * a.m_subgroupCount * 128) +
2905 (primitiveID * a.m_subgroupCount * 128) + sgid);
2906 if (false == countOnly)
2907 {
2908 ref.at(index) = tcu::UVec4(uint32_t(storeValue & 0xFFFFFFFF), 0u, 0u, 0u);
2909 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
2910 {
2911 logFailureCount -= 1u;
2912 log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
2913 << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
2914 }
2915 }
2916 }
2917 }
2918
2919 virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t primitiveID,
2920 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
2921 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
2922 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
2923 const OPType reason, const tcu::UVec4 *cmp) override
2924 {
2925 DE_UNREF(opsIndex);
2926 uint32_t isHelperInvocation;
2927 add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
2928 for (const uint32_t id : a.m_fragmentSubgroups.at(primitiveID))
2929 {
2930 const uint32_t sgid = a.fqin(id, isHelperInvocation);
2931 if (sgid >= (a.m_subgroupCount * a.m_subgroupSize))
2932 continue;
2933 if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
2934 continue;
2935 const uint32_t loc = primitiveID * a.m_subgroupCount * 128 + sgid;
2936 const uint32_t index = ((outLoc.at(loc)++) * (a.m_primitiveStride * a.m_subgroupCount * 128) +
2937 (primitiveID * a.m_subgroupCount * 128) + sgid);
2938 if (false == countOnly)
2939 {
2940 ref.at(index) = Ballot(activeMask.at(sgid / a.m_subgroupSize));
2941 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
2942 {
2943 logFailureCount -= 1u;
2944 log << tcu::TestLog::Message << logFailureCount << ": ballot mismatch from " << OPtypeToStr(reason)
2945 << tcu::TestLog::EndMessage;
2946 }
2947 }
2948 }
2949 }
2950
2951 virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
2952 const uint32_t subgroupSize, const uint32_t fragmentStride,
2953 const uint32_t primitiveStride,
2954 add_ref<std::vector<SubgroupState2>> stateStack,
2955 add_ref<std::vector<uint32_t>> outLoc,
2956 add_ref<uint32_t> subgroupCount) override
2957 {
2958 auto prerequisites = std::make_shared<Arrangement>(outputP, fragmentStride, 1u, subgroupSize, primitiveStride);
2959 subgroupCount = prerequisites->m_subgroupCount;
2960 stateStack.resize(10u, SubgroupState2(subgroupCount));
2961 outLoc.resize((subgroupCount * 128u * fragmentStride), 0u);
2962 stateStack.at(0).activeMask = prerequisites->m_initialBallots;
2963 return prerequisites;
2964 }
2965 };
2966
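// Vertex-stage variant. Each generated point primitive corresponds to one vertex invocation;
// the subgroup layout is read back from the outputP header (subgroup count, subgroup size,
// invocation count) followed by one packed ID per vertex.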
2967 class VertexRandomProgram : public RandomProgram
2968 {
2969 public:
2970 static const constexpr uint32_t fillPercentage = 73u;
2971 VertexRandomProgram(add_cref<CaseDef> c)
2972 : RandomProgram(c,
2973 static_cast<uint32_t>(Arrangement::generatePrimitives(c.sizeX, c.sizeY, fillPercentage).size()))
2974 {
2975 DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_VERTEX_BIT);
2976 }
2977 virtual ~VertexRandomProgram() = default;
2978
2979 struct Arrangement : Prerequisites
2980 {
2981 static constexpr uint32_t NUM_SUBGROUPS_OFFSET = 0u;
2982 static constexpr uint32_t SUBGROUP_SIZE_OFFSET = 1u;
2983 static constexpr uint32_t INVOCATION_COUNT_OFFSET = 2u;
2984 static constexpr uint32_t INVOCATION_ENTRIES_OFFSET = 3u;
2985
2986 const uint32_t m_subgroupSize;
2987 const uint32_t m_primitiveStride;
2988 const uint32_t m_subgroupCount;
2989 const Ballots m_initialBallots;
2990 const uint32_t m_invocationStride;
2991 const std::vector<uint32_t> m_primitiveSubgroups;
2992 Arrangement(add_cref<std::vector<uint32_t>> outputP, uint32_t subgroupSize, uint32_t primitiveStride)
2993 : m_subgroupSize(subgroupSize)
2994 , m_primitiveStride(primitiveStride)
2995 , m_subgroupCount(calcSubgroupCount(outputP))
2996 , m_initialBallots(makeInitialBallots(subgroupSize, primitiveStride, outputP))
2997 , m_invocationStride(primitiveStride)
2998 , m_primitiveSubgroups(makePrimitiveSubgroups(subgroupSize, primitiveStride, outputP))
2999 {
3000 }
3001 static uint32_t calcSubgroupCount(add_cref<std::vector<uint32_t>> outputP)
3002 {
3003 return outputP.at(NUM_SUBGROUPS_OFFSET);
3004 }
3005 static uint32_t calcSubgroupSize(add_cref<std::vector<uint32_t>> outputP)
3006 {
3007 return outputP.at(SUBGROUP_SIZE_OFFSET);
3008 }
3009 static uint32_t calcSubgroupInvocationStride(add_cref<std::vector<uint32_t>> outputP)
3010 {
3011 return outputP.at(INVOCATION_COUNT_OFFSET);
3012 }
3013 static Ballots makeInitialBallots(uint32_t subgroupSize, uint32_t primitiveStride,
3014 add_cref<std::vector<uint32_t>> outputP)
3015 {
3016 DE_UNREF(subgroupSize);
3017 const uint32_t subgroupCount = calcSubgroupCount(outputP);
3018 Ballots initialBallots(subgroupCount);
3019 for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
3020 {
3021 const uint32_t id = outputP.at(primitiveID + INVOCATION_ENTRIES_OFFSET);
3022 if (id)
3023 {
3024 const uint32_t subgroupID = (id >> 16) - 1u;
3025 const uint32_t subgroupInvocationID = id & 0xFFFF;
3026 DE_ASSERT(subgroupID < subgroupCount);
3027 DE_ASSERT(subgroupInvocationID < subgroupSize);
3028 initialBallots.at(subgroupID).set(subgroupInvocationID);
3029 }
3030 }
3031 return initialBallots;
3032 }
3033 static std::vector<uint32_t> makePrimitiveSubgroups(uint32_t subgroupSize, uint32_t primitiveStride,
3034 add_cref<std::vector<uint32_t>> outputP)
3035 {
3036 std::vector<uint32_t> map(primitiveStride);
3037 for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
3038 {
3039 const uint32_t id = outputP.at(primitiveID + INVOCATION_ENTRIES_OFFSET);
3040 if (id)
3041 {
3042 const uint32_t subgroupID = (id >> 16) - 1u;
3043 const uint32_t subgroupInvocationID = id & 0xFFFF;
3044 DE_ASSERT(subgroupInvocationID < subgroupSize);
3045 map.at(primitiveID) = subgroupID * subgroupSize + subgroupInvocationID;
3046 }
3047 }
3048 return map;
3049 }
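        // Picks a pseudo-random, duplicate-free set of pixel centers covering fillPercent
        // of a width x height grid and returns them as clip-space point coordinates in [-1, 1].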
3050 static std::vector<tcu::Vec4> generatePrimitives(uint32_t width, uint32_t height, uint32_t fillPercent)
3051 {
3052 deRandom rnd;
3053 std::map<uint32_t, int> map;
3054 std::vector<tcu::Vec4> points;
3055 const uint32_t frags = (width * height);
3056 const uint32_t total = (frags * fillPercent) / 100u;
3057
3058 deRandom_init(&rnd, (width * height));
3059
3060 for (uint32_t i = 0u; i < total; ++i)
3061 {
3062 const uint32_t r = deRandom_getUint32(&rnd) % frags;
3063 if (map[r] != 0)
3064 {
3065 i -= 1;
3066 continue;
3067 }
3068 map[r] = 1;
3069
3070 uint32_t y = r / width;
3071 uint32_t x = r % width;
3072 float xx = (float(x) + float(x + 1)) / (2.0f * float(width));
3073 float yy = (float(y) + float(y + 1)) / (2.0f * float(height));
3074 float xxx = xx * 2.0f - 1.0f;
3075 float yyy = yy * 2.0f - 1.0f;
3076 points.emplace_back(tcu::Vec4(xxx, yyy, 0u, 0u));
3077 }
3078 return points;
3079 }
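        // Builds a synthetic outputP buffer with the layout the consumers above expect:
        // the three-word header followed by one packed per-vertex entry,
        // ((subgroupID + 1) << 16) | subgroupInvocationID (a zero entry is treated as unused).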
3080 static std::vector<uint32_t> generateOutputPvector(uint32_t subgroupSize, uint32_t vertexCount)
3081 {
3082 const uint32_t subgroupCount = ROUNDUP(vertexCount, subgroupSize) / subgroupSize;
3083 std::vector<uint32_t> outputP(vertexCount + INVOCATION_ENTRIES_OFFSET);
3084 outputP.at(NUM_SUBGROUPS_OFFSET) = subgroupCount;
3085 outputP.at(SUBGROUP_SIZE_OFFSET) = subgroupSize;
3086 outputP.at(INVOCATION_COUNT_OFFSET) = vertexCount;
3087 for (uint32_t vertexID = 0u; vertexID < vertexCount; ++vertexID)
3088 {
3089 const uint32_t subgroupID = vertexID / subgroupSize;
3090 const uint32_t subgroupInvocationID = vertexID % subgroupSize;
3091 outputP.at(vertexID + INVOCATION_ENTRIES_OFFSET) = ((subgroupID + 1u) << 16) | subgroupInvocationID;
3092 }
3093 return outputP;
3094 }
3095 };
3096
3097 virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
3098 {
3099 DE_ASSERT(false); // use overloaded version of simulate() instead
3100 DE_UNREF(countOnly);
3101 DE_UNREF(subgroupSize);
3102 DE_UNREF(ref);
3103 return 0;
3104 }
3105
3106 protected:
3107 virtual void genIf(IFType ifType, uint32_t /*maxLocalIndexCmp*/) override
3108 {
3109 RandomProgram::genIf(ifType, RandomProgram::invocationStride);
3110 }
3111
3112 virtual std::string getPartitionBallotText() override
3113 {
3114 return "storeValue(outLoc++, subgroupBallot(true))";
3115 }
3116
3117 virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
3118 {
3119 printIndent(css);
3120 css << "if (invocationIndex() >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
3121 }
3122
3123 virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
3124 {
3125 printIndent(css);
3126 css << "storeValue(outLoc++, 0x" << std::hex << flow.ops[flow.opsIndex].value << std::dec << ");\n";
3127 }
3128
3129 virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
3130 bool endWithSemicolon = false) override
3131 {
3132 printIndent(css);
3133 // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
3134 // a ballot, to make sure the ballot is "diverged enough". Don't do this for
3135 // subgroup_uniform_control_flow, since we only validate results that must be fully
3136 // reconverged.
3137 if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
3138 {
3139 css << getPartitionBallotText();
3140 }
3141 else
3142 {
3143 css << "storeValue(outLoc++, subgroupBallot(true))";
3144 }
3145 if (endWithSemicolon)
3146 {
3147 css << ";\n";
3148 }
3149 }
3150
3151 virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
3152 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
3153 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
3154 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
3155 const OPType reason, const tcu::UVec4 *cmp) override
3156 {
3157 DE_UNREF(unusedPrimitiveID);
3158 DE_UNREF(opsIndex);
3159 add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
3160 for (uint32_t primitiveID = 0u; primitiveID < a.m_primitiveStride; ++primitiveID)
3161 {
3162 const uint32_t sgid = a.m_primitiveSubgroups.at(primitiveID);
3163 DE_ASSERT(sgid < (a.m_subgroupCount * a.m_subgroupSize));
3164 if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
3165 continue;
3166 const uint32_t index = (outLoc.at(primitiveID)++) * a.m_invocationStride + primitiveID;
3167 if (false == countOnly)
3168 {
3169 ref.at(index) = Ballot(activeMask.at(sgid / a.m_subgroupSize));
3170 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
3171 {
3172 logFailureCount -= 1u;
3173 log << tcu::TestLog::Message << logFailureCount << ": ballot mismatch from "
3174 << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
3175 }
3176 }
3177 }
3178 }
3179
3180 virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
3181 const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
3182 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
3183 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
3184 const OPType reason, const tcu::UVec4 *cmp) override
3185 {
3186 DE_UNREF(unusedPrimitiveID);
3187 add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
3188 for (uint32_t primitiveID = 0u; primitiveID < a.m_primitiveStride; ++primitiveID)
3189 {
3190 const uint32_t sgid = a.m_primitiveSubgroups.at(primitiveID);
3191 DE_ASSERT(sgid < (a.m_subgroupCount * a.m_subgroupSize));
3192 if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
3193 continue;
3194 const uint32_t index = (outLoc.at(primitiveID)++) * a.m_invocationStride + primitiveID;
3195 if (false == countOnly)
3196 {
3197 ref.at(index) = Ballot(tcu::UVec4(uint32_t(storeValue & 0xFFFFFFFF), 0u, 0u, 0u));
3198 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
3199 {
3200 logFailureCount -= 1u;
3201 log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
3202 << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
3203 }
3204 }
3205 }
3206 }
3207
3208 virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
3209 const uint32_t subgroupSize, const uint32_t fragmentStride,
3210 const uint32_t primitiveStride,
3211 add_ref<std::vector<SubgroupState2>> stateStack,
3212 add_ref<std::vector<uint32_t>> outLoc,
3213 add_ref<uint32_t> subgroupCount) override
3214 {
3215 DE_UNREF(fragmentStride);
3216 auto prerequisites = std::make_shared<Arrangement>(outputP, subgroupSize, primitiveStride);
3217 subgroupCount = prerequisites->m_subgroupCount;
3218 stateStack.resize(10u, SubgroupState2(subgroupCount));
3219 outLoc.resize(primitiveStride, 0u);
3220 stateStack.at(0).activeMask = prerequisites->m_initialBallots;
3221 return prerequisites;
3222 }
3223 };
3224
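// Specialization of RandomProgram whose generated GLSL runs in a tessellation control shader;
// simulate() below replays the same op stream on the CPU with per-invocation bitmasks to produce
// the expected ballot/store outputs.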
3225 class TessCtrlRandomProgram : public RandomProgram
3226 {
3227 public:
3228     TessCtrlRandomProgram(add_cref<CaseDef> c, uint32_t invocationCount) : RandomProgram(c, invocationCount)
3229 {
3230 DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT);
3231 }
3232 virtual ~TessCtrlRandomProgram() = default;
3233
3234 static const uint32_t minSubgroupSize = 4;
3235
3236     virtual void genIf(IFType ifType, uint32_t /*maxLocalIndexCmp*/) override
3237 {
3238 RandomProgram::genIf(ifType, std::min((minSubgroupSize * caseDef.sizeX), 64u));
3239 }
3240
3241     virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
3242 {
3243 printIndent(css);
3244 css << "if (";
3245 css << "((((gl_PrimitiveID * width) / gl_SubgroupSize) * gl_SubgroupSize) + gl_SubgroupInvocationID)";
3246 css << " >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
3247 }
3248
3249     virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
3250 {
3251 printIndent(css);
3252 css << "outputC.loc[invocationIndex()]++;\n";
3253 printIndent(css);
3254 css << "outputB.b[(outLoc++) * invocationStride + invocationIndex()].x = 0x" << std::hex
3255 << flow.ops[flow.opsIndex].value << ";\n";
3256 }
3257
3258     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
3259 bool endWithSemicolon = false) override
3260 {
3261 printIndent(css);
3262
3263 css << "outputC.loc[invocationIndex()]++,";
3264 // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
3265 // a ballot, to make sure the ballot is "diverged enough". Don't do this for
3266 // subgroup_uniform_control_flow, since we only validate results that must be fully
3267 // reconverged.
3268 if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
3269 {
3270 css << "outputB.b[(outLoc++) * invocationStride + invocationIndex()] = " << getPartitionBallotText()
3271 << ".xy";
3272 }
3273 else
3274 {
3275 css << "outputB.b[(outLoc++) * invocationStride + invocationIndex()] = subgroupBallot(true).xy";
3276 }
3277 if (endWithSemicolon)
3278 {
3279 css << ";\n";
3280 }
3281 }
3282
3283     void simulateStoreToChange(bool countOnly, uint32_t /*subgroupSize*/, const SubgroupState (&stateStack)[10],
3284 int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
3285 add_ref<std::vector<uint64_t>> ref)
3286 {
3287 for (uint32_t id = 0; id < invocationStride; ++id)
3288 {
3289 if (stateStack[nesting].activeMask.test(id))
3290 {
3291 if (countOnly)
3292 outLoc[id]++;
3293 else
3294 ref[(outLoc[id]++) * invocationStride + id] = ops[opsIndex].value;
3295 }
3296 }
3297 }
3298
3299     void simulateBallotToChange(bool countOnly, uint32_t subgroupSize, const SubgroupState (&stateStack)[10],
3300 uint32_t /*opsIndex*/, add_ref<std::vector<uint32_t>> outLoc,
3301 add_ref<std::vector<uint64_t>> ref)
3302 {
3303 for (uint32_t id = 0; id < invocationStride; ++id)
3304 {
3305 if (stateStack[nesting].activeMask.test(id))
3306 {
3307 if (countOnly)
3308 outLoc[id]++;
3309 else
3310 ref[(outLoc[id]++) * invocationStride + id] =
3311 bitsetToU64(stateStack[nesting].activeMask, subgroupSize, id);
3312 }
3313 }
3314 }
3315
3316 // Simulate execution of the program. If countOnly is true, just return
3317 // the max number of outputs written. If it's false, store out the result
3318 // values to ref.
3319     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
3320 {
3321 SubgroupState stateStack[10];
3322 deMemset(&stateStack, 0, sizeof(stateStack));
3323
3324 // Per-invocation output location counters
3325 std::vector<uint32_t> outLoc(invocationStride, 0u);
3326
3327 nesting = 0;
3328 loopNesting = 0;
3329
3330 for (uint32_t k = 0; k < invocationStride; ++k)
3331 stateStack[nesting].activeMask.set(k);
3332
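        // Walk the generated op stream, keeping a stack of active masks that mirrors the nesting
        // of the control flow the generated shader executes; each OP_* entry either records an
        // output (ballot/store) or filters/restores the current active mask.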
3333 int32_t i = 0;
3334 while (i < (int32_t)ops.size())
3335 {
3336 switch (ops[i].type)
3337 {
3338 case OP_BALLOT:
3339 simulateBallotToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3340 break;
3341 case OP_STORE:
3342 simulateStoreToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3343 break;
3344 case OP_IF_MASK:
3345 nesting++;
3346 stateStack[nesting].activeMask =
3347 stateStack[nesting - 1].activeMask & bitsetFromU64(ops[i].value, subgroupSize);
3348 stateStack[nesting].header = i;
3349 stateStack[nesting].isLoop = 0;
3350 stateStack[nesting].isSwitch = 0;
3351 break;
3352 case OP_ELSE_MASK:
3353 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask &
3354 ~bitsetFromU64(ops[stateStack[nesting].header].value, subgroupSize);
3355 break;
3356 case OP_IF_LOOPCOUNT:
3357 {
3358 uint32_t n = nesting;
3359 while (!stateStack[n].isLoop)
3360 n--;
3361
3362 nesting++;
3363 stateStack[nesting].activeMask =
3364 stateStack[nesting - 1].activeMask & bitsetFromU64((1ULL << stateStack[n].tripCount), subgroupSize);
3365 stateStack[nesting].header = i;
3366 stateStack[nesting].isLoop = 0;
3367 stateStack[nesting].isSwitch = 0;
3368 break;
3369 }
3370 case OP_ELSE_LOOPCOUNT:
3371 {
3372 uint32_t n = nesting;
3373 while (!stateStack[n].isLoop)
3374 n--;
3375
3376 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask &
3377 ~bitsetFromU64((1ULL << stateStack[n].tripCount), subgroupSize);
3378 break;
3379 }
3380 case OP_IF_LOCAL_INVOCATION_INDEX: // TessCtrlRandomProgram
3381 {
3382 // all bits >= N
3383 bitset_inv_t mask;
3384 for (uint32_t j = static_cast<uint32_t>(ops[i].value); j < invocationStride; ++j)
3385 mask.set(j);
3386
3387 nesting++;
3388 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
3389 stateStack[nesting].header = i;
3390 stateStack[nesting].isLoop = 0;
3391 stateStack[nesting].isSwitch = 0;
3392 break;
3393 }
3394 case OP_ELSE_LOCAL_INVOCATION_INDEX: // TessCtrlRandomProgram
3395 {
3396 // all bits < N
3397 bitset_inv_t mask;
3398 for (uint32_t j = 0; j < static_cast<uint32_t>(ops[i].value); ++j)
3399 mask.set(j);
3400
3401 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
3402 break;
3403 }
3404 case OP_ENDIF:
3405 nesting--;
3406 break;
3407 case OP_BEGIN_FOR_UNIF:
3408 // XXX TODO: We don't handle a for loop with zero iterations
3409 nesting++;
3410 loopNesting++;
3411 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3412 stateStack[nesting].header = i;
3413 stateStack[nesting].tripCount = 0;
3414 stateStack[nesting].isLoop = 1;
3415 stateStack[nesting].isSwitch = 0;
3416 stateStack[nesting].continueMask = 0;
3417 break;
3418 case OP_END_FOR_UNIF:
3419 stateStack[nesting].tripCount++;
3420 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3421 stateStack[nesting].continueMask = 0;
3422 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
3423 stateStack[nesting].activeMask.any())
3424 {
3425 i = stateStack[nesting].header + 1;
3426 continue;
3427 }
3428 else
3429 {
3430 loopNesting--;
3431 nesting--;
3432 }
3433 break;
3434 case OP_BEGIN_DO_WHILE_UNIF:
3435 // XXX TODO: We don't handle a for loop with zero iterations
3436 nesting++;
3437 loopNesting++;
3438 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3439 stateStack[nesting].header = i;
3440 stateStack[nesting].tripCount = 1;
3441 stateStack[nesting].isLoop = 1;
3442 stateStack[nesting].isSwitch = 0;
3443 stateStack[nesting].continueMask = 0;
3444 break;
3445 case OP_END_DO_WHILE_UNIF:
3446 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3447 stateStack[nesting].continueMask = 0;
3448 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
3449 stateStack[nesting].activeMask.any())
3450 {
3451 i = stateStack[nesting].header + 1;
3452 stateStack[nesting].tripCount++;
3453 continue;
3454 }
3455 else
3456 {
3457 loopNesting--;
3458 nesting--;
3459 }
3460 break;
3461 case OP_BEGIN_FOR_VAR:
3462 // XXX TODO: We don't handle a for loop with zero iterations
3463 nesting++;
3464 loopNesting++;
3465 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3466 stateStack[nesting].header = i;
3467 stateStack[nesting].tripCount = 0;
3468 stateStack[nesting].isLoop = 1;
3469 stateStack[nesting].isSwitch = 0;
3470 stateStack[nesting].continueMask = 0;
3471 break;
3472 case OP_END_FOR_VAR:
3473 stateStack[nesting].tripCount++;
3474 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3475 stateStack[nesting].continueMask = 0;
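            // A variable-bound loop retires lanes in order: after N completed trips every lane
            // whose subgroup-local index is below N is cleared from the active mask.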
3476 stateStack[nesting].activeMask &= bitsetFromU64(stateStack[nesting].tripCount == subgroupSize ?
3477 0 :
3478 ~((1ULL << (stateStack[nesting].tripCount)) - 1),
3479 subgroupSize);
3480 if (stateStack[nesting].activeMask.any())
3481 {
3482 i = stateStack[nesting].header + 1;
3483 continue;
3484 }
3485 else
3486 {
3487 loopNesting--;
3488 nesting--;
3489 }
3490 break;
3491 case OP_BEGIN_FOR_INF:
3492 case OP_BEGIN_DO_WHILE_INF:
3493 nesting++;
3494 loopNesting++;
3495 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3496 stateStack[nesting].header = i;
3497 stateStack[nesting].tripCount = 0;
3498 stateStack[nesting].isLoop = 1;
3499 stateStack[nesting].isSwitch = 0;
3500 stateStack[nesting].continueMask = 0;
3501 break;
3502 case OP_END_FOR_INF:
3503 stateStack[nesting].tripCount++;
3504 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3505 stateStack[nesting].continueMask = 0;
3506 if (stateStack[nesting].activeMask.any())
3507 {
3508 // output expected OP_BALLOT values
3509 simulateBallotToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3510
3511 i = stateStack[nesting].header + 1;
3512 continue;
3513 }
3514 else
3515 {
3516 loopNesting--;
3517 nesting--;
3518 }
3519 break;
3520 case OP_END_DO_WHILE_INF:
3521 stateStack[nesting].tripCount++;
3522 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3523 stateStack[nesting].continueMask = 0;
3524 if (stateStack[nesting].activeMask.any())
3525 {
3526 i = stateStack[nesting].header + 1;
3527 continue;
3528 }
3529 else
3530 {
3531 loopNesting--;
3532 nesting--;
3533 }
3534 break;
3535 case OP_BREAK:
3536 {
3537 uint32_t n = nesting;
3538 bitset_inv_t mask = stateStack[nesting].activeMask;
3539 while (true)
3540 {
3541 stateStack[n].activeMask &= ~mask;
3542 if (stateStack[n].isLoop || stateStack[n].isSwitch)
3543 break;
3544
3545 n--;
3546 }
3547 }
3548 break;
3549 case OP_CONTINUE:
3550 {
3551 uint32_t n = nesting;
3552 bitset_inv_t mask = stateStack[nesting].activeMask;
3553 while (true)
3554 {
3555 stateStack[n].activeMask &= ~mask;
3556 if (stateStack[n].isLoop)
3557 {
3558 stateStack[n].continueMask |= mask;
3559 break;
3560 }
3561 n--;
3562 }
3563 }
3564 break;
3565 case OP_ELECT:
3566 {
3567 nesting++;
3568 stateStack[nesting].activeMask = bitsetElect(stateStack[nesting - 1].activeMask, subgroupSize);
3569 stateStack[nesting].header = i;
3570 stateStack[nesting].isLoop = 0;
3571 stateStack[nesting].isSwitch = 0;
3572 }
3573 break;
3574 case OP_RETURN:
3575 {
3576 bitset_inv_t mask = stateStack[nesting].activeMask;
3577 for (int32_t n = nesting; n >= 0; --n)
3578 {
3579 stateStack[n].activeMask &= ~mask;
3580 if (stateStack[n].isCall)
3581 break;
3582 }
3583 }
3584 break;
3585
3586 case OP_CALL_BEGIN:
3587 nesting++;
3588 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3589 stateStack[nesting].isLoop = 0;
3590 stateStack[nesting].isSwitch = 0;
3591 stateStack[nesting].isCall = 1;
3592 break;
3593 case OP_CALL_END:
3594 stateStack[nesting].isCall = 0;
3595 nesting--;
3596 break;
3597 case OP_NOISE:
3598 break;
3599
3600 case OP_SWITCH_UNIF_BEGIN:
3601 case OP_SWITCH_VAR_BEGIN:
3602 case OP_SWITCH_LOOP_COUNT_BEGIN:
3603 nesting++;
3604 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3605 stateStack[nesting].header = i;
3606 stateStack[nesting].isLoop = 0;
3607 stateStack[nesting].isSwitch = 1;
3608 break;
3609 case OP_SWITCH_END:
3610 nesting--;
3611 break;
3612 case OP_CASE_MASK_BEGIN:
3613 stateStack[nesting].activeMask =
3614 stateStack[nesting - 1].activeMask & bitsetFromU64(ops[i].value, subgroupSize);
3615 break;
3616 case OP_CASE_LOOP_COUNT_BEGIN:
3617 {
3618 uint32_t n = nesting;
3619 uint32_t l = loopNesting;
3620
3621 while (true)
3622 {
3623 if (stateStack[n].isLoop)
3624 {
3625 l--;
3626 if (l == ops[stateStack[nesting].header].value)
3627 break;
3628 }
3629 n--;
3630 }
3631
3632 if ((1ULL << stateStack[n].tripCount) & ops[i].value)
3633 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3634 else
3635 stateStack[nesting].activeMask = 0;
3636 break;
3637 }
3638 case OP_CASE_END:
3639 break;
3640
3641 default:
3642 DE_ASSERT(0);
3643 break;
3644 }
3645 i++;
3646 }
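        // The widest per-invocation location counter tells the caller how many output slots
        // per invocation the buffers must provide.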
3647 uint32_t maxLoc = 0;
3648 for (uint32_t id = 0; id < (uint32_t)outLoc.size(); ++id)
3649 maxLoc = de::max(maxLoc, outLoc[id]);
3650
3651 return maxLoc;
3652 }
3653 };
3654
3655 class TessEvalRandomProgram : public RandomProgram
3656 {
3657 public:
3658     TessEvalRandomProgram(add_cref<CaseDef> c, uint32_t invocationCount = 0)
3659 : RandomProgram(c, (invocationCount ? invocationCount : 64))
3660 , ifLocalInvocationIndexAsSubgroupInvocationID(false)
3661 {
3662 DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT);
3663 }
3664 virtual ~TessEvalRandomProgram() = default;
3665
3666 const bool ifLocalInvocationIndexAsSubgroupInvocationID;
3667 static const uint32_t quadInvocationCount = 4;
3668
3669 // Simulate execution of the program. If countOnly is true, just return
3670 // the max number of outputs written. If it's false, store out the result
3671 // values to ref.
3672     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
3673 {
3674 SubgroupState stateStack[10];
3675 deMemset(&stateStack, 0, sizeof(stateStack));
3676
3677 // Per-invocation output location counters
3678 std::vector<uint32_t> outLoc(invocationStride, 0u);
3679
3680 nesting = 0;
3681 loopNesting = 0;
3682
3683 for (uint32_t k = 0; k < invocationStride; ++k)
3684 stateStack[nesting].activeMask.set(k);
3685
3686 int32_t i = 0;
3687 while (i < (int32_t)ops.size())
3688 {
3689 switch (ops[i].type)
3690 {
3691 case OP_BALLOT:
3692 simulateBallotToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3693 break;
3694 case OP_STORE:
3695 simulateStoreToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3696 break;
3697 case OP_IF_MASK:
3698 nesting++;
3699 stateStack[nesting].activeMask =
3700 stateStack[nesting - 1].activeMask & bitsetFromU64(ops[i].value, subgroupSize);
3701 stateStack[nesting].header = i;
3702 stateStack[nesting].isLoop = 0;
3703 stateStack[nesting].isSwitch = 0;
3704 break;
3705 case OP_ELSE_MASK:
3706 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask &
3707 ~bitsetFromU64(ops[stateStack[nesting].header].value, subgroupSize);
3708 break;
3709 case OP_IF_LOOPCOUNT:
3710 {
3711 uint32_t n = nesting;
3712 while (!stateStack[n].isLoop)
3713 n--;
3714
3715 nesting++;
3716 stateStack[nesting].activeMask =
3717 stateStack[nesting - 1].activeMask & bitsetFromU64((1ULL << stateStack[n].tripCount), subgroupSize);
3718 stateStack[nesting].header = i;
3719 stateStack[nesting].isLoop = 0;
3720 stateStack[nesting].isSwitch = 0;
3721 break;
3722 }
3723 case OP_ELSE_LOOPCOUNT:
3724 {
3725 uint32_t n = nesting;
3726 while (!stateStack[n].isLoop)
3727 n--;
3728
3729 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask &
3730 ~bitsetFromU64((1ULL << stateStack[n].tripCount), subgroupSize);
3731 break;
3732 }
3733 case OP_IF_LOCAL_INVOCATION_INDEX: // TessEvalRandomProgram
3734 {
3735 bitset_inv_t mask;
3736 if (ifLocalInvocationIndexAsSubgroupInvocationID)
3737 {
3738 // if (gl_SubgroupInvocationID >= value), all bits >= N
3739 for (uint32_t j = static_cast<uint32_t>(ops[i].value); j < subgroupSize; ++j)
3740 mask.set(j);
3741 mask = bitsetFromU64(mask.to_ullong(), subgroupSize);
3742 }
3743 else
3744 {
3745 // all bits >= N
3746 for (uint32_t j = (uint32_t)ops[i].value; j < invocationStride; ++j)
3747 mask.set(j);
3748 }
3749
3750 nesting++;
3751 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
3752 stateStack[nesting].header = i;
3753 stateStack[nesting].isLoop = 0;
3754 stateStack[nesting].isSwitch = 0;
3755 break;
3756 }
3757 case OP_ELSE_LOCAL_INVOCATION_INDEX: // TessEvalRandomProgram
3758 {
3759 // all bits < N
3760 bitset_inv_t mask;
3761 for (uint32_t j = 0; j < static_cast<uint32_t>(ops[i].value); ++j)
3762 mask.set(j);
3763
3764 if (ifLocalInvocationIndexAsSubgroupInvocationID)
3765 {
3766 // else (gl_SubgroupInvocationID >= value), all bits < N
3767 mask = bitsetFromU64(mask.to_ullong(), subgroupSize);
3768 }
3769
3770 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask & mask;
3771 break;
3772 }
3773 case OP_ENDIF:
3774 nesting--;
3775 break;
3776 case OP_BEGIN_FOR_UNIF:
3777 // XXX TODO: We don't handle a for loop with zero iterations
3778 nesting++;
3779 loopNesting++;
3780 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3781 stateStack[nesting].header = i;
3782 stateStack[nesting].tripCount = 0;
3783 stateStack[nesting].isLoop = 1;
3784 stateStack[nesting].isSwitch = 0;
3785 stateStack[nesting].continueMask = 0;
3786 break;
3787 case OP_END_FOR_UNIF:
3788 stateStack[nesting].tripCount++;
3789 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3790 stateStack[nesting].continueMask = 0;
3791 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
3792 stateStack[nesting].activeMask.any())
3793 {
3794 i = stateStack[nesting].header + 1;
3795 continue;
3796 }
3797 else
3798 {
3799 loopNesting--;
3800 nesting--;
3801 }
3802 break;
3803 case OP_BEGIN_DO_WHILE_UNIF:
3804 // XXX TODO: We don't handle a for loop with zero iterations
3805 nesting++;
3806 loopNesting++;
3807 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3808 stateStack[nesting].header = i;
3809 stateStack[nesting].tripCount = 1;
3810 stateStack[nesting].isLoop = 1;
3811 stateStack[nesting].isSwitch = 0;
3812 stateStack[nesting].continueMask = 0;
3813 break;
3814 case OP_END_DO_WHILE_UNIF:
3815 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3816 stateStack[nesting].continueMask = 0;
3817 if (stateStack[nesting].tripCount < ops[stateStack[nesting].header].value &&
3818 stateStack[nesting].activeMask.any())
3819 {
3820 i = stateStack[nesting].header + 1;
3821 stateStack[nesting].tripCount++;
3822 continue;
3823 }
3824 else
3825 {
3826 loopNesting--;
3827 nesting--;
3828 }
3829 break;
3830 case OP_BEGIN_FOR_VAR:
3831 // XXX TODO: We don't handle a for loop with zero iterations
3832 nesting++;
3833 loopNesting++;
3834 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3835 stateStack[nesting].header = i;
3836 stateStack[nesting].tripCount = 0;
3837 stateStack[nesting].isLoop = 1;
3838 stateStack[nesting].isSwitch = 0;
3839 stateStack[nesting].continueMask = 0;
3840 break;
3841 case OP_END_FOR_VAR:
3842 stateStack[nesting].tripCount++;
3843 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3844 stateStack[nesting].continueMask = 0;
3845 stateStack[nesting].activeMask &= bitsetFromU64(stateStack[nesting].tripCount == subgroupSize ?
3846 0 :
3847 ~((1ULL << (stateStack[nesting].tripCount)) - 1),
3848 subgroupSize);
3849 if (stateStack[nesting].activeMask.any())
3850 {
3851 i = stateStack[nesting].header + 1;
3852 continue;
3853 }
3854 else
3855 {
3856 loopNesting--;
3857 nesting--;
3858 }
3859 break;
3860 case OP_BEGIN_FOR_INF:
3861 case OP_BEGIN_DO_WHILE_INF:
3862 nesting++;
3863 loopNesting++;
3864 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3865 stateStack[nesting].header = i;
3866 stateStack[nesting].tripCount = 0;
3867 stateStack[nesting].isLoop = 1;
3868 stateStack[nesting].isSwitch = 0;
3869 stateStack[nesting].continueMask = 0;
3870 break;
3871 case OP_END_FOR_INF:
3872 stateStack[nesting].tripCount++;
3873 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3874 stateStack[nesting].continueMask = 0;
3875 if (stateStack[nesting].activeMask.any())
3876 {
3877 // output expected OP_BALLOT values
3878 simulateBallotToChange(countOnly, subgroupSize, stateStack, i, outLoc, ref);
3879
3880 i = stateStack[nesting].header + 1;
3881 continue;
3882 }
3883 else
3884 {
3885 loopNesting--;
3886 nesting--;
3887 }
3888 break;
3889 case OP_END_DO_WHILE_INF:
3890 stateStack[nesting].tripCount++;
3891 stateStack[nesting].activeMask |= stateStack[nesting].continueMask;
3892 stateStack[nesting].continueMask = 0;
3893 if (stateStack[nesting].activeMask.any())
3894 {
3895 i = stateStack[nesting].header + 1;
3896 continue;
3897 }
3898 else
3899 {
3900 loopNesting--;
3901 nesting--;
3902 }
3903 break;
3904 case OP_BREAK:
3905 {
3906 uint32_t n = nesting;
3907 bitset_inv_t mask = stateStack[nesting].activeMask;
3908 while (true)
3909 {
3910 stateStack[n].activeMask &= ~mask;
3911 if (stateStack[n].isLoop || stateStack[n].isSwitch)
3912 break;
3913
3914 n--;
3915 }
3916 }
3917 break;
3918 case OP_CONTINUE:
3919 {
3920 uint32_t n = nesting;
3921 bitset_inv_t mask = stateStack[nesting].activeMask;
3922 while (true)
3923 {
3924 stateStack[n].activeMask &= ~mask;
3925 if (stateStack[n].isLoop)
3926 {
3927 stateStack[n].continueMask |= mask;
3928 break;
3929 }
3930 n--;
3931 }
3932 }
3933 break;
3934 case OP_ELECT:
3935 {
3936 nesting++;
3937 stateStack[nesting].activeMask = bitsetElect(stateStack[nesting - 1].activeMask, subgroupSize);
3938 stateStack[nesting].header = i;
3939 stateStack[nesting].isLoop = 0;
3940 stateStack[nesting].isSwitch = 0;
3941 }
3942 break;
3943 case OP_RETURN:
3944 {
3945 bitset_inv_t mask = stateStack[nesting].activeMask;
3946 for (int32_t n = nesting; n >= 0; --n)
3947 {
3948 stateStack[n].activeMask &= ~mask;
3949 if (stateStack[n].isCall)
3950 break;
3951 }
3952 }
3953 break;
3954
3955 case OP_CALL_BEGIN:
3956 nesting++;
3957 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3958 stateStack[nesting].isLoop = 0;
3959 stateStack[nesting].isSwitch = 0;
3960 stateStack[nesting].isCall = 1;
3961 break;
3962 case OP_CALL_END:
3963 stateStack[nesting].isCall = 0;
3964 nesting--;
3965 break;
3966 case OP_NOISE:
3967 break;
3968
3969 case OP_SWITCH_UNIF_BEGIN:
3970 case OP_SWITCH_VAR_BEGIN:
3971 case OP_SWITCH_LOOP_COUNT_BEGIN:
3972 nesting++;
3973 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
3974 stateStack[nesting].header = i;
3975 stateStack[nesting].isLoop = 0;
3976 stateStack[nesting].isSwitch = 1;
3977 break;
3978 case OP_SWITCH_END:
3979 nesting--;
3980 break;
3981 case OP_CASE_MASK_BEGIN:
3982 stateStack[nesting].activeMask =
3983 stateStack[nesting - 1].activeMask & bitsetFromU64(ops[i].value, subgroupSize);
3984 break;
3985 case OP_CASE_LOOP_COUNT_BEGIN:
3986 {
3987 uint32_t n = nesting;
3988 uint32_t l = loopNesting;
3989
3990 while (true)
3991 {
3992 if (stateStack[n].isLoop)
3993 {
3994 l--;
3995 if (l == ops[stateStack[nesting].header].value)
3996 break;
3997 }
3998 n--;
3999 }
4000
4001 if ((1ULL << stateStack[n].tripCount) & ops[i].value)
4002 stateStack[nesting].activeMask = stateStack[nesting - 1].activeMask;
4003 else
4004 stateStack[nesting].activeMask = 0;
4005 break;
4006 }
4007 case OP_CASE_END:
4008 break;
4009
4010 default:
4011 DE_ASSERT(0);
4012 break;
4013 }
4014 i++;
4015 }
4016 uint32_t maxLoc = 0;
4017 for (uint32_t id = 0; id < (uint32_t)outLoc.size(); ++id)
4018 maxLoc = de::max(maxLoc, outLoc[id]);
4019
4020 return maxLoc;
4021 }
4022
4023 protected:
4024     virtual void genIf(IFType ifType, uint32_t /*maxLocalIndexCmp*/) override
4025 {
4026 RandomProgram::genIf(ifType, std::min(64u, (caseDef.sizeX * quadInvocationCount - 1)));
4027 }
4028
4029     virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
4030 {
4031 // uint invocationIndex() { return gl_PrimitiveID * width + gl_SubgroupInvocationID; }
4032 printIndent(css);
4033 css << "if (";
4034 if (ifLocalInvocationIndexAsSubgroupInvocationID)
4035 css << "gl_SubgroupInvocationID";
4036 else
4037 css << "((((gl_PrimitiveID * width) / gl_SubgroupSize) * gl_SubgroupSize) + gl_SubgroupInvocationID)";
4038 css << " >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
4039 }
4040
4041     virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
4042 {
4043 printIndent(css);
4044 css << "outputC.loc[invocationIndex()]++;\n";
4045 printIndent(css);
4046 css << "outputB.b[(outLoc++)*invocationStride + invocationIndex()].x = 0x" << std::hex
4047 << flow.ops[flow.opsIndex].value << ";\n";
4048 }
4049
4050     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
4051 bool endWithSemicolon = false) override
4052 {
4053 printIndent(css);
4054
4055 css << "outputC.loc[invocationIndex()]++,";
4056 // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
4057 // a ballot, to make sure the ballot is "diverged enough". Don't do this for
4058 // subgroup_uniform_control_flow, since we only validate results that must be fully
4059 // reconverged.
4060 if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
4061 {
4062 css << "outputB.b[(outLoc++)*invocationStride + invocationIndex()] = " << getPartitionBallotText() << ".xy";
4063 }
4064 else
4065 {
4066 css << "outputB.b[(outLoc++)*invocationStride + invocationIndex()] = subgroupBallot(true).xy";
4067 }
4068 if (endWithSemicolon)
4069 {
4070 css << ";\n";
4071 }
4072 }
4073
4074     void simulateStoreToChange(bool countOnly, uint32_t /*subgroupSize*/, const SubgroupState (&stateStack)[10],
4075 int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
4076 add_ref<std::vector<uint64_t>> ref)
4077 {
4078 for (uint32_t id = 0; id < invocationStride; ++id)
4079 {
4080 if (stateStack[nesting].activeMask.test(id))
4081 {
4082 if (countOnly)
4083 outLoc[id]++;
4084 else
4085 ref[(outLoc[id]++) * invocationStride + id] = ops[opsIndex].value;
4086 }
4087 }
4088 }
4089
4090     void simulateBallotToChange(bool countOnly, uint32_t subgroupSize, const SubgroupState (&stateStack)[10],
4091 uint32_t /*opsIndex*/, add_ref<std::vector<uint32_t>> outLoc,
4092 add_ref<std::vector<uint64_t>> ref)
4093 {
4094 for (uint32_t id = 0; id < invocationStride; ++id)
4095 {
4096 if (stateStack[nesting].activeMask.test(id))
4097 {
4098 if (countOnly)
4099 outLoc[id]++;
4100 else
4101 ref[(outLoc[id]++) * invocationStride + id] =
4102 bitsetToU64(stateStack[nesting].activeMask, subgroupSize, id);
4103 }
4104 }
4105 }
4106 };
4107
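// Specialization of RandomProgram for the geometry stage: each input point primitive corresponds
// to one invocation in the bookkeeping below, and the nested Arrangement helper reconstructs the
// subgroup layout the device actually used from the identities written to the outputP buffer.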
4108 class GeometryRandomProgram : public RandomProgram
4109 {
4110 public:
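    // Percentage of the width * height fragments that receive a point primitive
    // (see Arrangement::calculatePrimitiveCount / generatePrimitives below).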
4111     static constexpr uint32_t fillPercentage = 71u;
4112     GeometryRandomProgram(add_cref<CaseDef> c)
4113 : RandomProgram(c, Arrangement::calculatePrimitiveCount(c.sizeX, c.sizeY, fillPercentage))
4114 {
4115 DE_ASSERT(c.shaderStage == VK_SHADER_STAGE_GEOMETRY_BIT);
4116 }
4117 virtual ~GeometryRandomProgram() = default;
4118
4119 struct Arrangement : Prerequisites
4120 {
4121 static constexpr uint32_t NUM_SUBGROUPS_OFFSET = 0u;
4122 static constexpr uint32_t SUBGROUP_SIZE_OFFSET = 1u;
4123 static constexpr uint32_t INVOCATION_COUNT_OFFSET = 2u;
4124 static constexpr uint32_t MAX_LOC_OFFSET = 3u;
4125 static constexpr uint32_t MAX_IDENTITY_OFFSET = 4u;
4126 static constexpr uint32_t INVOCATION_ENTRY_OFFSET = 5u;
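        // Layout of outputP: five header words at the offsets above, followed by one entry per
        // primitive at INVOCATION_ENTRY_OFFSET + primitiveID packed as
        // ((subgroupID + 1) << 16) | gl_SubgroupInvocationID, where a zero entry marks a
        // primitive that never recorded its identity.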
4127
4128 const uint32_t m_shaderSubgroupSize;
4129 const uint32_t m_shaderSubgroupCount;
4130 const uint32_t m_shaderInvocationCount;
4131 const uint32_t m_shaderMaxLoc;
4132 const uint32_t m_shaderMaxIdentity;
4133
4134 const uint32_t m_subgroupSize;
4135 const uint32_t m_primitiveStride;
4136 const uint32_t m_invocationStride;
4137 const uint32_t m_subgroupCount;
4138 const Ballots m_initialBallots;
4139 const std::vector<uint32_t> m_primitiveSubgroups;
4140
4141         Arrangement(add_cref<std::vector<uint32_t>> outputP, uint32_t subgroupSize, uint32_t primitiveStride)
4142 : m_shaderSubgroupSize(outputP.at(SUBGROUP_SIZE_OFFSET))
4143 , m_shaderSubgroupCount(outputP.at(NUM_SUBGROUPS_OFFSET))
4144 , m_shaderInvocationCount(outputP.at(INVOCATION_COUNT_OFFSET))
4145 , m_shaderMaxLoc(outputP.at(MAX_LOC_OFFSET))
4146 , m_shaderMaxIdentity(outputP.at(MAX_IDENTITY_OFFSET))
4147 , m_subgroupSize(subgroupSize)
4148 , m_primitiveStride(primitiveStride)
4149 , m_invocationStride(primitiveStride)
4150 , m_subgroupCount(ROUNDUP(primitiveStride, subgroupSize) / subgroupSize)
4151 , m_initialBallots(makeInitialBallots(outputP))
4152 , m_primitiveSubgroups(makePrimitiveSubgroups(outputP))
4153 {
4154 }
4155         static Ballots makeInitialBallots(add_cref<std::vector<uint32_t>> outputP)
4156 {
4157 const uint32_t subgroupCount = outputP.at(NUM_SUBGROUPS_OFFSET);
4158 const uint32_t subgroupSize = outputP.at(SUBGROUP_SIZE_OFFSET);
4159 DE_UNREF(subgroupSize);
4160 const uint32_t primitiveStride = outputP.at(INVOCATION_COUNT_OFFSET);
4161 Ballots b(subgroupCount);
4162 for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
4163 {
4164 const uint32_t id = outputP.at(primitiveID + INVOCATION_ENTRY_OFFSET);
4165 if (id)
4166 {
4167 const uint32_t subgroupID = (id >> 16) - 1u;
4168 const uint32_t subgroupInvocationID = id & 0xFFFF;
4169 DE_ASSERT(subgroupID < subgroupCount);
4170 DE_ASSERT(subgroupInvocationID < subgroupSize);
4171 b.at(subgroupID).set(subgroupInvocationID);
4172 }
4173 }
4174 return b;
4175 }
4176         static std::vector<uint32_t> makePrimitiveSubgroups(add_cref<std::vector<uint32_t>> outputP)
4177 {
4178 const uint32_t subgroupSize = outputP.at(SUBGROUP_SIZE_OFFSET);
4179 const uint32_t primitiveStride = outputP.at(INVOCATION_COUNT_OFFSET);
4180 std::vector<uint32_t> map(primitiveStride);
4181 for (uint32_t primitiveID = 0u; primitiveID < primitiveStride; ++primitiveID)
4182 {
4183 const uint32_t id = outputP.at(primitiveID + INVOCATION_ENTRY_OFFSET);
4184 if (id)
4185 {
4186 const uint32_t subgroupID = (id >> 16) - 1u;
4187 const uint32_t subgroupInvocationID = id & 0xFFFF;
4188 DE_ASSERT(subgroupInvocationID < subgroupSize);
4189 map.at(primitiveID) = subgroupID * subgroupSize + subgroupInvocationID;
4190 }
4191 }
4192 return map;
4193 }
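        // Randomly mark fillPercent% of the width * height fragments, re-drawing on collisions,
        // and return how many distinct fragments were selected. The RNG is seeded with
        // width * height so that generatePrimitives() below reproduces the same selection.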
4194         static uint32_t calculatePrimitiveCount(uint32_t width, uint32_t height, uint32_t fillPercent)
4195 {
4196 deRandom rnd;
4197 std::map<uint32_t, int> map;
4198 std::vector<tcu::Vec4> points;
4199 const uint32_t frags = (width * height);
4200 const uint32_t total = (frags * fillPercent) / 100u;
4201
4202 deRandom_init(&rnd, (width * height));
4203
4204 for (uint32_t i = 0u; i < total; ++i)
4205 {
4206 const uint32_t r = deRandom_getUint32(&rnd) % frags;
4207 if (map[r] != 0)
4208 {
4209 i -= 1;
4210 continue;
4211 }
4212 map[r] = 1;
4213 }
4214
4215 return static_cast<uint32_t>(map.size());
4216 }
4217         static std::vector<tcu::Vec4> generatePrimitives(uint32_t width, uint32_t height, uint32_t fillPercent)
4218 {
4219 deRandom rnd;
4220 std::map<uint32_t, int> map;
4221 std::vector<tcu::Vec4> points;
4222 const uint32_t frags = (width * height);
4223 const uint32_t total = (frags * fillPercent) / 100u;
4224
4225 deRandom_init(&rnd, (width * height));
4226
4227 for (uint32_t i = 0u; i < total; ++i)
4228 {
4229 const uint32_t r = deRandom_getUint32(&rnd) % frags;
4230 if (map[r] != 0)
4231 {
4232 i -= 1;
4233 continue;
4234 }
4235 map[r] = 1;
4236
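                // Emit the point at the centre of the chosen fragment, converted to
                // normalized device coordinates.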
4237 uint32_t y = r / width;
4238 uint32_t x = r % width;
4239 float xx = (float(x) + float(x + 1)) / (2.0f * float(width));
4240 float yy = (float(y) + float(y + 1)) / (2.0f * float(height));
4241 float xxx = xx * 2.0f - 1.0f;
4242 float yyy = yy * 2.0f - 1.0f;
4243 points.emplace_back(tcu::Vec4(xxx, yyy, 0u, 0u));
4244 }
4245 return points;
4246 }
4247         static std::vector<uint32_t> generateVectorOutputP(uint32_t subgroupSize, uint32_t primitiveStride)
4248 {
4249 const uint32_t subgroupCount = ROUNDUP(primitiveStride, subgroupSize) / subgroupSize;
4250 std::vector<uint32_t> outputP(primitiveStride + INVOCATION_ENTRY_OFFSET);
4251 outputP.at(NUM_SUBGROUPS_OFFSET) = subgroupCount;
4252 outputP.at(SUBGROUP_SIZE_OFFSET) = subgroupSize;
4253 outputP.at(INVOCATION_COUNT_OFFSET) = primitiveStride;
4254 outputP.at(MAX_LOC_OFFSET) = 0u;
4255 outputP.at(MAX_IDENTITY_OFFSET) = 0u;
4256 for (uint32_t vertexID = 0u; vertexID < primitiveStride; ++vertexID)
4257 {
4258 const uint32_t subgroupID = vertexID / subgroupSize;
4259 const uint32_t subgroupInvocationID = vertexID % subgroupSize;
4260 outputP.at(vertexID + INVOCATION_ENTRY_OFFSET) = ((subgroupID + 1u) << 16) | subgroupInvocationID;
4261 }
4262 return outputP;
4263 }
4264         static std::vector<uint32_t> generateVectorOutputP(uint32_t subgroupSize, uint32_t width, uint32_t height,
4265 uint32_t percent)
4266 {
4267 const uint32_t primitiveStride = calculatePrimitiveCount(width, height, percent);
4268 return generateVectorOutputP(subgroupSize, primitiveStride);
4269 }
4270 };
4271
4272     virtual uint32_t simulate(bool countOnly, uint32_t subgroupSize, add_ref<std::vector<uint64_t>> ref) override
4273 {
4274 DE_ASSERT(false); // use overloaded version of simulate() instead
4275 DE_UNREF(countOnly);
4276 DE_UNREF(subgroupSize);
4277 DE_UNREF(ref);
4278 return 0;
4279 }
4280
4281 protected:
4282     virtual void genIf(IFType ifType, uint32_t /*maxLocalIndexCmp*/) override
4283 {
4284 RandomProgram::genIf(ifType, RandomProgram::invocationStride);
4285 }
4286
4287     virtual std::string getPartitionBallotText() override
4288 {
4289 return "storeValue(outLoc++, subgroupBallot(true))";
4290 }
4291
4292     virtual void printIfLocalInvocationIndex(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
4293 {
4294 printIndent(css);
4295 css << "if (invocationIndex() >= inputA.a[0x" << std::hex << flow.ops[flow.opsIndex].value << "]) {\n";
4296 }
4297
4298     virtual void printStore(add_ref<std::stringstream> css, add_cref<FlowState> flow) override
4299 {
4300 printIndent(css);
4301 css << "storeValue(outLoc++, 0x" << std::hex << flow.ops[flow.opsIndex].value << std::dec << ");\n";
4302 }
4303
4304     virtual void printBallot(add_ref<std::stringstream> css, add_cref<FlowState>,
4305 bool endWithSemicolon = false) override
4306 {
4307 printIndent(css);
4308 // When inside loop(s), use partitionBallot rather than subgroupBallot to compute
4309 // a ballot, to make sure the ballot is "diverged enough". Don't do this for
4310 // subgroup_uniform_control_flow, since we only validate results that must be fully
4311 // reconverged.
4312 if (loopNesting > 0 && caseDef.testType == TT_MAXIMAL)
4313 {
4314 css << getPartitionBallotText();
4315 }
4316 else
4317 {
4318 css << "storeValue(outLoc++, subgroupBallot(true))";
4319 }
4320 if (endWithSemicolon)
4321 {
4322 css << ";\n";
4323 }
4324 }
4325
4326     virtual void simulateBallot(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
4327 const int32_t opsIndex, add_ref<std::vector<uint32_t>> outLoc,
4328 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
4329 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
4330 const OPType reason, const tcu::UVec4 *cmp) override
4331 {
4332 DE_UNREF(unusedPrimitiveID);
4333 DE_UNREF(opsIndex);
4334 add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
4335 for (uint32_t primitiveID = 0u; primitiveID < a.m_primitiveStride; ++primitiveID)
4336 {
4337 const uint32_t sgid = a.m_primitiveSubgroups.at(primitiveID);
4338 DE_ASSERT(sgid < (a.m_subgroupCount * a.m_subgroupSize));
4339 if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
4340 continue;
4341 const uint32_t index = (outLoc.at(primitiveID)++) * a.m_invocationStride + primitiveID;
4342 if (false == countOnly)
4343 {
4344 ref.at(index) = Ballot(activeMask.at(sgid / a.m_subgroupSize));
4345 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
4346 {
4347 logFailureCount -= 1u;
4348 log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
4349 << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
4350 }
4351 }
4352 }
4353 }
4354
4355     virtual void simulateStore(const bool countOnly, add_cref<Ballots> activeMask, const uint32_t unusedPrimitiveID,
4356 const uint64_t storeValue, add_ref<std::vector<uint32_t>> outLoc,
4357 add_ref<std::vector<tcu::UVec4>> ref, add_ref<tcu::TestLog> log,
4358 std::shared_ptr<Prerequisites> prerequisites, add_ref<uint32_t> logFailureCount,
4359 const OPType reason, const tcu::UVec4 *cmp) override
4360 {
4361 DE_UNREF(unusedPrimitiveID);
4362 add_cref<Arrangement> a(*std::static_pointer_cast<Arrangement>(prerequisites));
4363 for (uint32_t primitiveID = 0u; primitiveID < a.m_primitiveStride; ++primitiveID)
4364 {
4365 const uint32_t sgid = a.m_primitiveSubgroups.at(primitiveID);
4366 DE_ASSERT(sgid < (a.m_subgroupCount * a.m_subgroupSize));
4367 if (false == activeMask.test(Ballots::findBit(sgid, a.m_subgroupSize)))
4368 continue;
4369 const uint32_t index = (outLoc.at(primitiveID)++) * a.m_invocationStride + primitiveID;
4370 if (false == countOnly)
4371 {
4372 ref.at(index) = Ballot(tcu::UVec4(uint32_t(storeValue & 0xFFFFFFFF), 0u, 0u, 0u));
4373 if (cmp && logFailureCount > 0u && cmp[index] != ref.at(index))
4374 {
4375 logFailureCount -= 1u;
4376 log << tcu::TestLog::Message << logFailureCount << ": stored value mismatch from "
4377 << OPtypeToStr(reason) << tcu::TestLog::EndMessage;
4378 }
4379 }
4380 }
4381 }
4382
4383     virtual std::shared_ptr<Prerequisites> makePrerequisites(add_cref<std::vector<uint32_t>> outputP,
4384 const uint32_t subgroupSize, const uint32_t fragmentStride,
4385 const uint32_t primitiveStride,
4386 add_ref<std::vector<SubgroupState2>> stateStack,
4387 add_ref<std::vector<uint32_t>> outLoc,
4388 add_ref<uint32_t> subgroupCount) override
4389 {
4390 DE_UNREF(fragmentStride);
4391 auto prerequisites = std::make_shared<Arrangement>(outputP, subgroupSize, primitiveStride);
4392 subgroupCount = prerequisites->m_subgroupCount;
4393 stateStack.resize(10u, SubgroupState2(subgroupCount));
4394 outLoc.resize(primitiveStride, 0u);
4395 stateStack.at(0).activeMask = prerequisites->m_initialBallots;
4396 return prerequisites;
4397 }
4398 };
4399
4400 class ReconvergenceTestCase : public TestCase
4401 {
4402 public:
4403     ReconvergenceTestCase(tcu::TestContext &context, const std::string &name, const CaseDef data)
4404 : TestCase(context, name)
4405 , m_data(data)
4406 , m_program()
4407 , m_subgroupSizeToMaxLoc()
4408 {
4409 }
4410 ~ReconvergenceTestCase(void) = default;
4411 virtual void delayedInit(void) override;
4412 virtual void checkSupport(Context &context) const override;
4413 virtual void initPrograms(SourceCollections &programCollection) const override;
4414 virtual TestInstance *createInstance(Context &context) const override;
4415 de::MovePtr<RandomProgram> selectProgram() const;
4416
4417 private:
4418 CaseDef m_data;
4419 std::shared_ptr<RandomProgram> m_program;
4420 mutable std::map<uint32_t, uint32_t> m_subgroupSizeToMaxLoc;
4421 };
4422
4423 void ReconvergenceTestCase::checkSupport(Context &context) const
4424 {
4425 if (!context.contextSupports(vk::ApiVersion(0u, 1u, 1u, 0u)))
4426 TCU_THROW(NotSupportedError, "Vulkan 1.1 not supported");
4427
4428 const auto properties = getSubgroupProperties(context);
4429 const vk::VkPhysicalDeviceSubgroupProperties &subgroupProperties = properties.first;
4430 const VkPhysicalDeviceLimits &limits = properties.second.properties.limits;
4431
4432 if (m_data.isElect() && !(subgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_BASIC_BIT))
4433 TCU_THROW(NotSupportedError, "VK_SUBGROUP_FEATURE_BASIC_BIT not supported");
4434
4435 if (!m_data.isElect() && !(subgroupProperties.supportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT))
4436 TCU_THROW(NotSupportedError, "VK_SUBGROUP_FEATURE_BALLOT_BIT not supported");
4437
4438 if (m_data.shaderStage == VK_SHADER_STAGE_COMPUTE_BIT)
4439 {
4440 if ((m_data.sizeX > limits.maxComputeWorkGroupSize[0]) || (m_data.sizeY > limits.maxComputeWorkGroupSize[1]) ||
4441 ((m_data.sizeX * m_data.sizeY) > limits.maxComputeWorkGroupInvocations))
4442 {
4443             TCU_THROW(NotSupportedError, "compute workgroup size exceeds device limit");
4444 }
4445 }
4446
4447 if (!(subgroupProperties.supportedStages & m_data.shaderStage))
4448 {
4449 std::stringstream ss;
4450 ss << getShaderStageFlagsStr(m_data.shaderStage);
4451 ss << " does not support subgroup operations";
4452 ss.flush();
4453 TCU_THROW(NotSupportedError, ss.str());
4454 }
4455
4456 // Both subgroup- AND workgroup-uniform tests are enabled by shaderSubgroupUniformControlFlow.
4457 if (m_data.isUCF() && !context.getShaderSubgroupUniformControlFlowFeatures().shaderSubgroupUniformControlFlow)
4458 TCU_THROW(NotSupportedError, "shaderSubgroupUniformControlFlow not supported");
4459
4460 if (m_data.testType == TT_MAXIMAL && !context.getShaderMaximalReconvergenceFeatures().shaderMaximalReconvergence)
4461 TCU_THROW(NotSupportedError, "shaderMaximalReconvergence not supported");
4462 }
4463
4464 de::MovePtr<RandomProgram> ReconvergenceTestCase::selectProgram() const
4465 {
4466 RandomProgram *programPtr(nullptr);
4467 switch (m_data.shaderStage)
4468 {
4469 case VK_SHADER_STAGE_COMPUTE_BIT:
4470 programPtr = new ComputeRandomProgram(m_data);
4471 break;
4472 case VK_SHADER_STAGE_FRAGMENT_BIT:
4473 programPtr = new FragmentRandomProgram(m_data);
4474 break;
4475 case VK_SHADER_STAGE_VERTEX_BIT:
4476 programPtr = new VertexRandomProgram(m_data);
4477 break;
4478 case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
4479 programPtr = new TessCtrlRandomProgram(m_data, 0);
4480 break;
4481 case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
4482 programPtr = new TessEvalRandomProgram(m_data);
4483 break;
4484 case VK_SHADER_STAGE_GEOMETRY_BIT:
4485 programPtr = new GeometryRandomProgram(m_data);
4486 break;
4487 default:
4488 DE_ASSERT(0);
4489 }
4490 DE_ASSERT(programPtr);
4491 return de::MovePtr<RandomProgram>(programPtr);
4492 }
4493
4494 std::string genPassThroughFragmentSource()
4495 {
4496 std::stringstream str;
4497 str << "#version 450 core\n";
4498 str << "layout(location = 0) out vec4 color;\n";
4499 str << "void main() {\n";
4500 str << " color = vec4(1.0);\n";
4501 str << "}\n";
4502 str.flush();
4503 return str.str();
4504 }
4505
4506 std::string genPassThroughVertexSource()
4507 {
4508 std::stringstream str;
4509 str << "#version 450 core\n";
4510 str << "layout(location = 0) in vec4 pos;\n";
4511 str << "void main() {\n";
4512 str << " gl_Position = vec4(pos.xy, 0.0, 1.0);\n";
4513 str << "}\n";
4514 str.flush();
4515 return str.str();
4516 }
4517
4518 std::string genPassThroughTessCtrlSource()
4519 {
4520 std::stringstream str;
4521 str << "#version 450 core\n";
4522 str << "#extension GL_EXT_tessellation_shader : require\n";
4523 str << "layout(vertices = 3) out;\n";
4524 str << "void main() {\n";
4525 str << " gl_out[gl_InvocationID].gl_Position = gl_in[gl_InvocationID].gl_Position;\n";
4526 str << " gl_TessLevelOuter[0] = 1.0;\n";
4527 str << " gl_TessLevelOuter[1] = 1.0;\n";
4528 str << " gl_TessLevelOuter[2] = 1.0;\n";
4529 str << " gl_TessLevelOuter[3] = 1.0;\n";
4530 str << " gl_TessLevelInner[0] = 1.0;\n";
4531 str << " gl_TessLevelInner[1] = 1.0;\n";
4532 str << "}\n";
4533 str.flush();
4534 return str.str();
4535 }
4536
4537 std::string genPassThroughTessEvalSource()
4538 {
4539 std::stringstream str;
4540 str << "#version 450 core\n";
4541 str << "#extension GL_EXT_tessellation_shader : require\n";
4542 str << "layout(equal_spacing, triangles) in;\n";
4543 str << "void main() {\n";
4544 str << " float u = gl_TessCoord.x;\n";
4545 str << " float v = gl_TessCoord.y;\n";
4546 str << " float w = gl_TessCoord.z;\n";
4547 str << " vec4 p0 = vec4(gl_in[0].gl_Position.xy, 0.0, 1.0);\n";
4548 str << " vec4 p1 = vec4(gl_in[1].gl_Position.xy, 0.0, 1.0);\n";
4549 str << " vec4 p2 = vec4(gl_in[2].gl_Position.xy, 0.0, 1.0);\n";
4550 str << " gl_Position = u * p0 + v * p1 + w * p2;\n";
4551 str << "}\n";
4552 str.flush();
4553 return str.str();
4554 }
4555
4556 void ReconvergenceTestCase::delayedInit(void)
4557 {
4558 m_program = std::shared_ptr<RandomProgram>(selectProgram().release());
4559 }
4560
4561 void ReconvergenceTestCase::initPrograms(SourceCollections &programCollection) const
4562 {
4563 de::MovePtr<RandomProgram> program = selectProgram();
4564
4565 m_subgroupSizeToMaxLoc = program->generateRandomProgram(m_testCtx.getWatchDog(), m_testCtx.getLog());
4566
4567 std::stringstream header, layout, globals, prologue, epilogue, aux;
4568
4569 header << "#version 450 core\n";
4570 header << "#extension GL_KHR_shader_subgroup_ballot : enable\n";
4571 header << "#extension GL_KHR_shader_subgroup_vote : enable\n";
4572 header << "#extension GL_NV_shader_subgroup_partitioned : enable\n";
4573 header << "#extension GL_EXT_subgroup_uniform_control_flow : enable\n";
4574 if (m_data.testType == TT_MAXIMAL)
4575 {
4576 header << "#extension GL_EXT_maximal_reconvergence : require\n";
4577 }
4578 switch (m_data.shaderStage)
4579 {
4580 case VK_SHADER_STAGE_COMPUTE_BIT:
4581 layout << "layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z = 1) in;\n";
4582 layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4583 layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec4 b[]; } outputB;\n";
4584 layout << "layout(set=0, binding=0) coherent buffer InputA { uint a[]; } inputA;\n";
4585 break;
4586 case VK_SHADER_STAGE_FRAGMENT_BIT:
4587 layout << "// NOTE: A fragment can belong to more than one primitive, and the shader processes each\n";
4588         layout << "// fragment primitive by primitive, so the number of invocations does not have to be\n";
4589 layout << "// equal to the number of fragments of the rendering area. Another important thing\n";
4590         layout << "// is that the implementation is free to change the order of drawing primitives\n";
4591 layout << "// between subsequent application calls.\n";
4592
4593 layout << "// inputA.a[ invocationStride ] = { 0, 1, ..., (invocationStride - 1) }\n";
4594 layout << "layout(set=0, binding=0) coherent buffer InputA { uint a[]; } inputA;\n";
4595
4596 layout << "// outputB.b[ max(loc[]) * invocationStride * primitiveStride ]\n";
4597 layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec4 b[]; } outputB;\n";
4598
4599         layout << "// outputC.loc[ invocationStride * primitiveStride ], incremented per primitive\n";
4600 layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4601
4602         layout << "// outputP.p[ width * height * primitiveStride + 4 ], trailing entries used for subgroup bookkeeping\n";
4603 layout << "layout(set=0, binding=3) coherent buffer OutputP { uint p[]; } outputP;\n";
4604
4605 layout << "layout(location = 0) out vec4 dEQP_FragColor;\n";
4606 break;
4607 case VK_SHADER_STAGE_VERTEX_BIT:
4608 layout << "layout(location = 0) in vec4 pos;\n";
4609 layout << "layout(set=0, binding=3) coherent buffer OutputP { uint p[]; } outputP;\n";
4610 layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4611 layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec4 b[]; } outputB;\n";
4612 layout << "layout(set=0, binding=0) coherent buffer InputA { uint a[]; } inputA;\n";
4613 break;
4614 case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
4615 layout << "#extension GL_EXT_tessellation_shader : require\n";
4616 layout << "layout(vertices = " << TessCtrlRandomProgram::minSubgroupSize << ") out;\n";
4617 layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4618 layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec2 b[]; } outputB;\n";
4619 layout << "layout(set=0, binding=0) coherent buffer InputA { uint a[]; } inputA;\n";
4620 break;
4621 case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
4622 layout << "#extension GL_EXT_tessellation_shader : require\n";
4623 layout << "layout(equal_spacing, quads) in;\n";
4624 layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4625 layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec2 b[]; } outputB;\n";
4626 layout << "layout(set=0, binding=0) coherent buffer InputA { uint a[]; } inputA;\n";
4627 break;
4628 case VK_SHADER_STAGE_GEOMETRY_BIT:
4629 layout << "#extension GL_EXT_geometry_shader : require\n";
4630 layout << "layout(points) in;\n";
4631 layout << "layout(points, max_vertices = 1) out;\n";
4632 layout << "layout(set=0, binding=3) coherent buffer OutputP { uint p[]; } outputP;\n";
4633 layout << "layout(set=0, binding=2) coherent buffer OutputC { uint loc[]; } outputC;\n";
4634 layout << "layout(set=0, binding=1) coherent buffer OutputB { uvec4 b[]; } outputB;\n";
4635 layout << "layout(set=0, binding=0) coherent buffer InputA { uint a[]; } inputA;\n";
4636 break;
4637 default:
4638 DE_ASSERT(0);
4639 }
4640
4641 std::stringstream pushConstantLayout;
4642 pushConstantLayout
4643 << "layout(push_constant) uniform PC {\n"
4644 " // set to the real stride when writing out ballots, or zero when just counting\n"
4645 " int invocationStride;\n"
4646 " // wildcard fields, for an example the dimensions of rendered area in the case of graphics shaders\n"
4647 " int width;\n"
4648 " int height;\n"
4649 " uint primitiveStride;\n"
4650 " uint subgroupStride;\n"
4651 " uint enableInvocationIndex;\n"
4652 "};\n";
4653 pushConstantLayout.flush();
4654 layout << pushConstantLayout.str();
4655
4656 globals << "int outLoc = 0;\n";
4657 globals << "bool testBit(uvec4 mask, uint bit) { return ((mask[bit / 32] >> (bit % 32)) & 1) != 0; }\n";
4658 globals << "uint elect() { return int(subgroupElect()) + 1; }\n";
4659 if (m_data.shaderStage == VK_SHADER_STAGE_FRAGMENT_BIT)
4660 {
4661 static const std::string helperRoutinesCode(R"glsl(
4662 void setBit(uint bit, in out uvec4 ballot) {
4663 uint c = bit / 32;
4664 switch (c) {
4665 case 0: ballot.x |= (1u << (bit % 32)); break;
4666 case 1: ballot.y |= (1u << (bit % 32)); break;
4667 case 2: ballot.z |= (1u << (bit % 32)); break;
4668 case 3: ballot.w |= (1u << (bit % 32)); break;
4669 }
4670 }
4671 void resetBit(uint bit, in out uvec4 ballot) {
4672 uint c = bit / 32;
4673 uint mask = 0xFFFFFFFF ^ (1u << (bit % 32));
4674 switch (c) {
4675 case 0: ballot.x &= mask; break;
4676 case 1: ballot.y &= mask; break;
4677 case 2: ballot.z &= mask; break;
4678 case 3: ballot.w &= mask; break;
4679 }
4680 }
4681 uint fragmentIndex() { return (uint(gl_FragCoord.y) * width + uint(gl_FragCoord.x)); }
4682 uint invocationIndex() { return subgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; }
4683 uvec4 invocationElectBallot() {
4684 uvec4 ballot = uvec4(0);
4685 ballot[gl_SubgroupInvocationID / 32] = (1 << (gl_SubgroupInvocationID % 32));
4686 return ballot;
4687 }
4688 uint next(uint hint) {
4689 return gl_HelperInvocation
4690 ? (hint * enableInvocationIndex)
4691 : outputC.loc[(gl_PrimitiveID * (subgroupStride * 128) + invocationIndex()) * enableInvocationIndex]++;
4692 }
4693 uint index(uint hint) {
4694 return ((
4695 next(hint) * (subgroupStride * 128 * primitiveStride)
4696 + (gl_PrimitiveID * subgroupStride * 128) + invocationIndex()) * enableInvocationIndex);
4697 }
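            // Helper invocations do not write their results to the SSBOs here; storeValue() parks
            // them in ballotStack, and an elected non-helper lane copies them out in the epilogue.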
4698 void storeValue(uint hintIndex, uvec4 value)
4699 {
4700 if (gl_HelperInvocation) {
4701 if (hintIndex < BALLOT_STACK_SIZE)
4702 ballotStack[hintIndex] = value;
4703 }
4704 else {
4705 outputB.b[index(hintIndex)] = value;
4706 }
4707 }
4708 void storeValue(uint hintIndex, uint value) { storeValue(hintIndex, uvec4(value, 0, 0, 0)); }
4709 void storeBallot(uint hintIndex) { storeValue(hintIndex, subgroupBallot(true)); }
4710 )glsl");
4711
4712 static const std::string prologueCode(R"glsl(
4713 uint helperInvocationCount = 0u;
4714 uint nonHelperInvocationCount = 0u;
4715 uvec4 helperInvocationsBits = uvec4(0, 0, 0, 0);
4716 uvec4 nonHelperInvocationsBits = uvec4(0, 0, 0, 0);
4717 if (gl_HelperInvocation)
4718 {
4719 helperInvocationsBits = subgroupBallot(true);
4720 helperInvocationCount = 1u;
4721 }
4722 else
4723 {
4724 nonHelperInvocationsBits = subgroupBallot(true);
4725 nonHelperInvocationCount = 1u;
4726 }
4727
4728 helperInvocationsBits = subgroupOr(helperInvocationsBits);
4729 nonHelperInvocationsBits = subgroupOr(nonHelperInvocationsBits);
4730 uint helperBitCount = subgroupBallotBitCount(helperInvocationsBits);
4731 uint nonHelperBitCount = subgroupBallotBitCount(nonHelperInvocationsBits);
4732 helperInvocationCount = subgroupAdd(helperInvocationCount);
4733 nonHelperInvocationCount = subgroupAdd(nonHelperInvocationCount);
4734
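	// The lowest-numbered non-helper lane acts as the subgroup's representative: it allocates a
	// subgroupID and accumulates the per-subgroup totals kept in the trailing slots of outputP.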
4735 const uint nonHelperElectBit = subgroupBallotFindLSB(nonHelperInvocationsBits);
4736 if (gl_SubgroupInvocationID == nonHelperElectBit)
4737 {
4738 subgroupID = atomicAdd(outputP.p[width * height * primitiveStride + 0], 1);
4739 outputP.p[width * height * primitiveStride + 1] = gl_SubgroupSize;
4740 atomicAdd(outputP.p[width * height * primitiveStride + 2], nonHelperInvocationCount);
4741 atomicAdd(outputP.p[width * height * primitiveStride + 3], helperInvocationCount);
4742 }
4743
4744 subgroupID = subgroupShuffle(subgroupID, nonHelperElectBit);
4745
4746 const uint localPrimitiveID = gl_PrimitiveID;
4747 const uint localFragmentID = fragmentIndex();
4748
4749 if (!gl_HelperInvocation)
4750 {
4751 outputP.p[localFragmentID * primitiveStride + localPrimitiveID] =
4752 ((subgroupID + 1) << 16) | gl_SubgroupInvocationID;
4753 }
4754
4755 	// Mapping helper invocations block
4756 {
4757 uvec4 tmpHelperBits = helperInvocationsBits;
4758 uint helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4759 while (subgroupBallotBitExtract(tmpHelperBits, helperSubgroupInvocationID))
4760 {
4761 uint helperSubgroupID = subgroupShuffle(subgroupID, helperSubgroupInvocationID);
4762 uint helperFragmentID = subgroupShuffle(localFragmentID, helperSubgroupInvocationID);
4763 uint helperPrimitiveID = subgroupShuffle(localPrimitiveID, helperSubgroupInvocationID);
4764 if (gl_SubgroupInvocationID == nonHelperElectBit)
4765 {
4766 outputP.p[helperFragmentID * primitiveStride + helperPrimitiveID] =
4767 (((helperSubgroupID + 1) | 0x8000) << 16) | helperSubgroupInvocationID;
4768 }
4769 resetBit(helperSubgroupInvocationID, tmpHelperBits);
4770 helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4771 }
4772 }
4773 )glsl");
4774
4775 static const std::string epilogueCode(R"glsl(
4776 // Save helper invocations entries block
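	// Helper invocations cannot write their results to outputB/outputC themselves, so the elected
	// non-helper lane pulls each helper lane's outLoc and ballotStack over with subgroupShuffle and
	// stores them on that lane's behalf.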
4777 {
4778 uvec4 tmpHelperBits = subgroupOr(helperInvocationsBits);
4779 uint helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4780 while (helperSubgroupInvocationID < gl_SubgroupSize)
4781 {
4782 const uint maxOutLoc = subgroupShuffle(outLoc, helperSubgroupInvocationID);
4783 if (maxOutLoc == 0)
4784 {
4785 resetBit(helperSubgroupInvocationID, tmpHelperBits);
4786 helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4787 continue;
4788 }
4789
4790 uvec4 helperBallotStack[BALLOT_STACK_SIZE];
4791 uint helperSubgroupID = subgroupShuffle(subgroupID, helperSubgroupInvocationID);
4792 uint helperFragmentID = subgroupShuffle(localFragmentID, helperSubgroupInvocationID);
4793 uint helperPrimitiveID = subgroupShuffle(localPrimitiveID, helperSubgroupInvocationID);
4794 for (uint i = 0; i < maxOutLoc && i < BALLOT_STACK_SIZE; i++) {
4795 helperBallotStack[i] = subgroupShuffle(ballotStack[i], helperSubgroupInvocationID);
4796 }
4797
4798 if (gl_SubgroupInvocationID == nonHelperElectBit)
4799 {
4800 uint helperInvocationIndex = helperSubgroupID * gl_SubgroupSize + helperSubgroupInvocationID;
4801 uint helperPrimitiveInvocationIndex = helperInvocationIndex * primitiveStride + helperPrimitiveID;
4802
4803 outputC.loc[(helperInvocationIndex * primitiveStride + helperPrimitiveID) * enableInvocationIndex] = maxOutLoc;
4804
4805 for (uint j = 0; j < maxOutLoc; j++)
4806 {
4807 uint outputIndex = ((j * (subgroupStride * 128u * primitiveStride)
4808 + (helperPrimitiveID * subgroupStride * 128u) + helperInvocationIndex) * enableInvocationIndex);
4809 uvec4 outputValue = (j < BALLOT_STACK_SIZE) ? helperBallotStack[j] : uvec4(0,0,0,0);
4810 outputB.b[outputIndex] = outputValue;
4811 }
4812 }
4813 resetBit(helperSubgroupInvocationID, tmpHelperBits);
4814 helperSubgroupInvocationID = subgroupBallotFindLSB(tmpHelperBits);
4815 	} // end while over remaining helper lanes
4816 }
4817
4818 dEQP_FragColor = vec4(1.0);
4819 )glsl");
4820
4821 header << "#extension GL_KHR_shader_subgroup_shuffle : enable\n";
4822 header << "#extension GL_KHR_shader_subgroup_arithmetic : enable\n";
4823 header << "#define BALLOT_STACK_SIZE " << FragmentRandomProgram::experimentalOutLocSize << '\n';
4824
4825 {
4826 aux << header.str();
4827 aux << pushConstantLayout.str();
4828 aux << "uint outLoc = 0;\n";
4829 aux << "struct OutputC { uint loc[1]; };\n";
4830 aux << "struct OutputB { uvec4 b[1]; };\n";
4831 aux << "uint subgroupID = 11111;\n";
4832 aux << "uvec4 ballotStack[BALLOT_STACK_SIZE];\n";
4833 aux << "OutputC outputC;\n";
4834 aux << "OutputB outputB;\n";
4835         aux << "// OutputP.p[ width * height * primitiveStride + 4 ]: the last four slots hold the subgroupID "
4836                "counter, the subgroup size, and the non-helper/helper invocation counts\n";
4837 aux << "layout(set = 0, binding = 0) coherent buffer OutputP { uint p[]; } outputP;\n";
4838 aux << "layout(location = 0) out vec4 dEQP_FragColor;\n";
4839 aux << helperRoutinesCode;
4840 aux << "void main() {\n"
4841 << prologueCode << epilogueCode << " \n"
4842 << "}\n";
4843 }
4844
4845 globals << "uint subgroupID = 22222;\n";
4846 globals << "uvec4 ballotStack[BALLOT_STACK_SIZE];\n";
4847 globals << helperRoutinesCode;
4848
4849 prologue << prologueCode;
4850 epilogue << epilogueCode;
4851 }
4852 else if (m_data.shaderStage == VK_SHADER_STAGE_VERTEX_BIT)
4853 {
4854 static const std::string helperRoutinesCode(R"glsl(
4855 uint invocationIndex() { return subgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; }
4856 uvec4 invocationElectBallot() {
4857 uvec4 ballot = uvec4(0);
4858 ballot[gl_SubgroupInvocationID / 32] = (1 << (gl_SubgroupInvocationID % 32));
4859 return ballot;
4860 }
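	// outputC.loc stores loc + 1 so that a value of zero still means "this vertex wrote nothing".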
4861 void storeValue(uint loc, uvec4 value) {
4862 outputC.loc[gl_VertexIndex] = loc + 1u;
4863 outputB.b[(loc * invocationStride + gl_VertexIndex) * enableInvocationIndex] = value;
4864 }
4865 void storeValue(uint loc, uint value) { storeValue(loc, uvec4(value, 0, 0, 0)); }
4866 )glsl");
4867
4868 static const std::string prologueCode(R"glsl(
4869 uint invocationCount = 1u;
4870 invocationCount = subgroupAdd(invocationCount);
4871
4872 if (subgroupElect())
4873 {
4874 subgroupID = atomicAdd(outputP.p[NUM_SUBGROUPS_OFFSET], 1u); // [+0] subgroupID
4875 outputP.p[SUBGROUP_SIZE_OFFSET] = gl_SubgroupSize; // [+1] subgroupSize
4876 atomicAdd(outputP.p[INVOCATION_COUNT_OFFSET], invocationCount); // [+2] invocationCount
4877 }
4878 subgroupID = subgroupBroadcastFirst(subgroupID);
4879
4880 outputP.p[gl_VertexIndex + INVOCATION_ENTRIES_OFFSET] = ((subgroupID + 1) << 16) | gl_SubgroupInvocationID;
4881 )glsl");
4882
4883 static const std::string epilogueCode(R"glsl(
4884 gl_Position = vec4(pos.xy, 0.0, 1.0);
4885 gl_PointSize = 1.0;
4886 )glsl");
4887
4888 header << "#extension GL_KHR_shader_subgroup_arithmetic : enable\n";
4889 header << "#define NUM_SUBGROUPS_OFFSET 0\n";
4890 header << "#define SUBGROUP_SIZE_OFFSET 1\n";
4891 header << "#define INVOCATION_COUNT_OFFSET 2\n";
4892 header << "#define INVOCATION_ENTRIES_OFFSET 3\n";
4893
4894 globals << "uint subgroupID = 33333;\n";
4895 globals << helperRoutinesCode;
4896
4897 prologue << prologueCode;
4898 epilogue << epilogueCode;
4899 }
4900 else if (m_data.shaderStage == VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT)
4901 {
4902 // push_constant::width holds the smallest subgroup size defined in TessCtrlRandomProgram::minSubgroupSize
4903         globals << "// push_constant::width is the smallest subgroup size that this shader can run with\n";
4904 globals << "uint invocationIndex() { return ((((gl_PrimitiveID * width) / gl_SubgroupSize) * gl_SubgroupSize) "
4905 "+ gl_SubgroupInvocationID); }\n";
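        // invocationIndex(): the per-patch base (gl_PrimitiveID * width) is rounded down to a subgroup
        // boundary before gl_SubgroupInvocationID is added.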
4906
4907 epilogue
4908 << " gl_out[gl_InvocationID].gl_Position = gl_in[gl_InvocationID % gl_PatchVerticesIn].gl_Position;\n";
4909 epilogue << " gl_TessLevelOuter[0] = 1.0;\n";
4910 epilogue << " gl_TessLevelOuter[1] = 1.0;\n";
4911 epilogue << " gl_TessLevelOuter[2] = 1.0;\n";
4912 epilogue << " gl_TessLevelOuter[3] = 1.0;\n";
4913 epilogue << " gl_TessLevelInner[0] = 1.0;\n";
4914 epilogue << " gl_TessLevelInner[1] = 1.0;\n";
4915 }
4916 else if (m_data.shaderStage == VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)
4917 {
4918 globals << "// push_constant::width is an invocation count when processing a quad for a single patch\n";
4919 globals << "uint invocationIndex() { return ((((gl_PrimitiveID * width) / gl_SubgroupSize) * gl_SubgroupSize) "
4920 "+ gl_SubgroupInvocationID); }\n";
4921
4922 epilogue << " float u = gl_TessCoord.x;\n";
4923 epilogue << " float v = gl_TessCoord.y;\n";
4924 epilogue << " float w = gl_TessCoord.z;\n";
4925 epilogue << " vec4 p0 = vec4(gl_in[0].gl_Position.xy, 0.0, 1.0);\n";
4926 epilogue << " vec4 p1 = vec4(gl_in[1].gl_Position.xy, 0.0, 1.0);\n";
4927 epilogue << " vec4 p2 = vec4(gl_in[2].gl_Position.xy, 0.0, 1.0);\n";
4928 epilogue << " gl_Position = u * p0 + v * p1 + w * p2;\n";
4929 }
4930 else if (m_data.shaderStage == VK_SHADER_STAGE_GEOMETRY_BIT)
4931 {
4932 static const std::string helperRoutinesCode(R"glsl(
4933 uint invocationIndex() { return subgroupID * gl_SubgroupSize + gl_SubgroupInvocationID; }
4934 void storeValue(uint loc, uvec4 value) {
4935 outputC.loc[gl_PrimitiveIDIn] = loc + 1u;
4936 outputB.b[(loc * invocationStride + gl_PrimitiveIDIn) * enableInvocationIndex] = value;
4937 }
4938 void storeValue(uint loc, uint value) { storeValue(loc, uvec4(value, 0, 0, 0)); }
4939 void storeBallot(uint loc) { storeValue(loc, subgroupBallot(true)); }
4940 uvec4 invocationElectBallot() {
4941 uvec4 ballot = uvec4(0);
4942 ballot[gl_SubgroupInvocationID / 32] = (1 << (gl_SubgroupInvocationID % 32));
4943 return ballot;
4944 }
4945 )glsl");
4946
4947 static const std::string prologueCode(R"glsl(
4948 uint invocationCount = 1u;
4949 invocationCount = subgroupAdd(invocationCount);
4950 uint identity = gl_PrimitiveIDIn + 1u;
4951 uint maxIdentity = subgroupMax(identity);
4952
4953 if (subgroupElect()) {
4954 subgroupID = atomicAdd(outputP.p[SUBGROUP_ID_OFFSET], 1u); // [+0] subgroupID
4955 outputP.p[SUBGROUP_SIZE_OFFSET] = gl_SubgroupSize; // [+1] subgroupSize
4956 atomicAdd(outputP.p[INVOCATION_COUNT_OFFSET], invocationCount); // [+2] invocationCount
4957 atomicMax(outputP.p[MAX_IDENTITY_OFFSET], maxIdentity);
4958 }
4959 subgroupID = subgroupBroadcastFirst(subgroupID);
4960
4961 outputP.p[gl_PrimitiveIDIn + INVOCATION_ENTRY_OFFSET] = ((subgroupID + 1) << 16) | gl_SubgroupInvocationID;
4962
4963 )glsl");
4964
4965 static const std::string epilogueCode(R"glsl(
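	// Record the largest number of output locations written by any invocation in outputP.p[MAX_LOC_OFFSET].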
4966 uint maxLoc = subgroupMax(outLoc);
4967 atomicMax(outputP.p[MAX_LOC_OFFSET], maxLoc);
4968
4969 gl_Position = gl_in[gl_PrimitiveIDIn].gl_Position;
4970 gl_PrimitiveID = gl_PrimitiveIDIn;
4971
4972 EmitVertex();
4973 EndPrimitive();
4974 )glsl");
4975
4976 header << "#extension GL_KHR_shader_subgroup_arithmetic : enable\n";
4977 header << "#define SUBGROUP_ID_OFFSET 0\n";
4978 header << "#define SUBGROUP_SIZE_OFFSET 1\n";
4979 header << "#define INVOCATION_COUNT_OFFSET 2\n";
4980 header << "#define MAX_LOC_OFFSET 3\n";
4981 header << "#define MAX_IDENTITY_OFFSET 4\n";
4982 header << "#define INVOCATION_ENTRY_OFFSET 5\n";
4983
4984 globals << "uint subgroupID;\n";
4985 globals << "uint numSubgroups;\n";
4986 globals << helperRoutinesCode;
4987
4988 prologue << prologueCode;
4989 epilogue << epilogueCode;
4990 }
4991
4992 std::stringstream css, functions, main;
4993 m_program->printCode(functions, main);
4994
4995 css << header.str();
4996 css << layout.str();
4997 css << globals.str();
4998
4999 css << functions.str() << "\n\n";
5000
5001 css << "void main()\n"
5002 << (m_data.isSUCF() ? "[[subgroup_uniform_control_flow]]\n" : "")
5003 << (m_data.testType == TT_MAXIMAL ? "[[maximally_reconverges]]\n" : "") << "{\n";
5004
5005 css << prologue.str() << "\n";
5006 css << main.str() << "\n\n";
5007 css << epilogue.str() << "\n";
5008
5009 css << "}\n";
5010
5011 const vk::ShaderBuildOptions buildOptions(programCollection.usedVulkanVersion, vk::SPIRV_VERSION_1_3, 0u);
5012
5013 auto &testingShader = programCollection.glslSources.add("test");
5014 switch (m_data.shaderStage)
5015 {
5016 case VK_SHADER_STAGE_COMPUTE_BIT:
5017 testingShader << glu::ComputeSource(css.str()) << buildOptions;
5018 break;
5019 case VK_SHADER_STAGE_FRAGMENT_BIT:
5020 testingShader << glu::FragmentSource(css.str()) << buildOptions;
5021 programCollection.glslSources.add("vert") << glu::VertexSource(genPassThroughVertexSource()) << buildOptions;
5022 programCollection.glslSources.add("aux") << glu::FragmentSource(aux.str()) << buildOptions;
5023 break;
5024 case VK_SHADER_STAGE_VERTEX_BIT:
5025 testingShader << glu::VertexSource(css.str()) << buildOptions;
5026 programCollection.glslSources.add("frag") << glu::FragmentSource(genPassThroughFragmentSource());
5027 break;
5028 case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
5029 testingShader << glu::TessellationControlSource(css.str()) << buildOptions;
5030 programCollection.glslSources.add("vert") << glu::VertexSource(genPassThroughVertexSource());
5031 programCollection.glslSources.add("frag") << glu::FragmentSource(genPassThroughFragmentSource());
5032 programCollection.glslSources.add("tese") << glu::TessellationEvaluationSource(genPassThroughTessEvalSource());
5033 break;
5034 case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
5035 testingShader << glu::TessellationEvaluationSource(css.str()) << buildOptions;
5036 programCollection.glslSources.add("vert") << glu::VertexSource(genPassThroughVertexSource());
5037 programCollection.glslSources.add("frag") << glu::FragmentSource(genPassThroughFragmentSource());
5038 programCollection.glslSources.add("tesc") << glu::TessellationControlSource(genPassThroughTessCtrlSource());
5039 break;
5040 case VK_SHADER_STAGE_GEOMETRY_BIT:
5041 testingShader << glu::GeometrySource(css.str()) << buildOptions;
5042 programCollection.glslSources.add("vert") << glu::VertexSource(genPassThroughVertexSource());
5043 programCollection.glslSources.add("frag") << glu::FragmentSource(genPassThroughFragmentSource());
5044 break;
5045 default:
5046 DE_ASSERT(0);
5047 }
5048 }
5049
5050 TestInstance *ReconvergenceTestCase::createInstance(Context &context) const
5051 {
5052 switch (m_data.shaderStage)
5053 {
5054 case VK_SHADER_STAGE_COMPUTE_BIT:
5055 return new ReconvergenceTestComputeInstance(context, m_data, m_program, std::move(m_subgroupSizeToMaxLoc));
5056 case VK_SHADER_STAGE_FRAGMENT_BIT:
5057 return new ReconvergenceTestFragmentInstance(context, m_data);
5058 case VK_SHADER_STAGE_VERTEX_BIT:
5059 return new ReconvergenceTestVertexInstance(context, m_data);
5060 case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
5061 return new ReconvergenceTestTessCtrlInstance(context, m_data);
5062 case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
5063 return new ReconvergenceTestTessEvalInstance(context, m_data);
5064 case VK_SHADER_STAGE_GEOMETRY_BIT:
5065 return new ReconvergenceTestGeometryInstance(context, m_data);
5066 default:
5067 DE_ASSERT(false);
5068 }
5069 return nullptr;
5070 }
5071
5072 tcu::TestStatus ReconvergenceTestComputeInstance::iterate(void)
5073 {
5074 const DeviceInterface &vk = m_context.getDeviceInterface();
5075 const VkDevice device = m_context.getDevice();
5076 Allocator &allocator = m_context.getDefaultAllocator();
5077 tcu::TestLog &log = m_context.getTestContext().getLog();
5078 const VkPhysicalDeviceLimits &limits = m_context.getDeviceProperties().limits;
5079
5080 const uint32_t invocationStride = m_data.sizeX * m_data.sizeY;
5081
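    // The same pipeline is dispatched twice: first a counting pass (push constant invocationStride == 0)
    // that only tallies how many ballot locations each invocation writes, then a second pass that stores
    // the actual ballots. A CPU simulation of the same random program produces the reference data.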
5082 std::vector<tcu::UVec4> ref;
5083 add_ref<ComputeRandomProgram> program(*m_program);
5084
5085 uint32_t precalculatedMaxLoc = 0u;
5086 if (auto itPrecalculatedMaxLoc = m_subgroupSizeToMaxLoc.find(m_subgroupSize);
5087 itPrecalculatedMaxLoc != m_subgroupSizeToMaxLoc.end())
5088 {
5089 precalculatedMaxLoc = itPrecalculatedMaxLoc->second;
5090 }
5091 uint32_t maxLoc = precalculatedMaxLoc ? precalculatedMaxLoc :
5092 program.execute(m_context.getTestContext().getWatchDog(), true,
5093 m_subgroupSize, 0u, invocationStride, ref, log);
5094 uint32_t shaderMaxLoc = maxLoc;
5095
5096 // maxLoc is per-invocation. Add one (to make sure no additional writes are done) and multiply by
5097 // the number of invocations
5098 maxLoc++;
5099 maxLoc *= invocationStride;
5100
5101 // buffer[0] is an input filled with a[i] == i
5102 // buffer[1] is the output
5103 // buffer[2] is the location counts
5104 de::MovePtr<BufferWithMemory> buffers[3];
5105 vk::VkDescriptorBufferInfo bufferDescriptors[3];
5106
5107 VkDeviceSize sizes[3] = {
5108 invocationStride * sizeof(uint32_t),
5109 maxLoc * sizeof(tcu::UVec4),
5110 invocationStride * sizeof(uint32_t),
5111 };
5112
5113 for (uint32_t i = 0; i < 3; ++i)
5114 {
5115 if (sizes[i] > limits.maxStorageBufferRange)
5116 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
5117
5118 try
5119 {
5120 buffers[i] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
5121 vk, device, allocator,
5122 makeBufferCreateInfo(sizes[i], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
5123 VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
5124 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
5125 }
5126 catch (tcu::ResourceError &)
5127 {
5128 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
5129 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
5130 "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
5131 }
5132 bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
5133 }
5134
5135 void *ptrs[3];
5136 for (uint32_t i = 0; i < 3; ++i)
5137 {
5138 ptrs[i] = buffers[i]->getAllocation().getHostPtr();
5139 }
5140 for (uint32_t i = 0; i < sizes[0] / sizeof(uint32_t); ++i)
5141 {
5142 ((uint32_t *)ptrs[0])[i] = i;
5143 }
5144 deMemset(ptrs[1], 0, (size_t)sizes[1]);
5145 deMemset(ptrs[2], 0, (size_t)sizes[2]);
5146
5147 vk::DescriptorSetLayoutBuilder layoutBuilder;
5148
5149 layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_data.shaderStage);
5150 layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_data.shaderStage);
5151 layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, m_data.shaderStage);
5152
5153 vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
5154
5155 vk::Unique<vk::VkDescriptorPool> descriptorPool(
5156 vk::DescriptorPoolBuilder()
5157 .addType(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 3u)
5158 .build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
5159 vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
5160
5161 const VkPushConstantRange pushConstantRange = {
5162 (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
5163 0u, // uint32_t offset;
5164 sizeof(PushConstant) // uint32_t size;
5165 };
5166
5167 const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = {
5168 VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
5169 DE_NULL, // pNext
5170 (VkPipelineLayoutCreateFlags)0,
5171 1, // setLayoutCount
5172 &descriptorSetLayout.get(), // pSetLayouts
5173 1u, // pushConstantRangeCount
5174 &pushConstantRange, // pPushConstantRanges
5175 };
5176
5177 flushAlloc(vk, device, buffers[0]->getAllocation());
5178 flushAlloc(vk, device, buffers[1]->getAllocation());
5179 flushAlloc(vk, device, buffers[2]->getAllocation());
5180
5181 const VkPipelineBindPoint bindPoint = VK_PIPELINE_BIND_POINT_COMPUTE;
5182 const Unique<VkShaderModule> shader(createShaderModule(vk, device, m_context.getBinaryCollection().get("test"), 0));
5183 Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
5184 Move<VkPipeline> pipeline = createComputePipeline(*pipelineLayout, *shader);
5185 const VkQueue queue = m_context.getUniversalQueue();
5186 Move<VkCommandPool> cmdPool = createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
5187 m_context.getUniversalQueueFamilyIndex());
5188 Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
5189
5190 vk::DescriptorSetUpdateBuilder setUpdateBuilder;
5191 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(0),
5192 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[0]);
5193 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(1),
5194 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[1]);
5195 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(2),
5196 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[2]);
5197 setUpdateBuilder.update(vk, device);
5198
5199     PushConstant pc{/* pc.invocationStride is zero-initialized, as are the remaining fields */};
5200
5201 // compute "maxLoc", the maximum number of locations written
5202 beginCommandBuffer(vk, *cmdBuffer, 0u);
5203 vk.cmdBindDescriptorSets(*cmdBuffer, bindPoint, *pipelineLayout, 0u, 1, &*descriptorSet, 0u, DE_NULL);
5204 vk.cmdBindPipeline(*cmdBuffer, bindPoint, *pipeline);
5205 vk.cmdPushConstants(*cmdBuffer, *pipelineLayout, m_data.shaderStage, 0, sizeof(pc), &pc);
5206 vk.cmdDispatch(*cmdBuffer, 1, 1, 1);
5207 endCommandBuffer(vk, *cmdBuffer);
5208
5209 submitCommandsAndWait(vk, device, queue, cmdBuffer.get());
5210
5211 invalidateAlloc(vk, device, buffers[1]->getAllocation());
5212 invalidateAlloc(vk, device, buffers[2]->getAllocation());
5213
5214 // Take the max over all invocations. Add one (to make sure no additional writes are done) and multiply by
5215 // the number of invocations
5216 uint32_t newMaxLoc = 0;
5217 for (uint32_t id = 0; id < invocationStride; ++id)
5218 newMaxLoc = de::max(newMaxLoc, ((uint32_t *)ptrs[2])[id]);
5219 shaderMaxLoc = newMaxLoc;
5220 newMaxLoc++;
5221 newMaxLoc *= invocationStride;
5222
5223 // If we need more space, reallocate buffers[1]
5224 if (newMaxLoc > maxLoc)
5225 {
5226 maxLoc = newMaxLoc;
5227 sizes[1] = maxLoc * sizeof(tcu::UVec4);
5228
5229 if (sizes[1] > limits.maxStorageBufferRange)
5230 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
5231
5232 try
5233 {
5234 buffers[1] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
5235 vk, device, allocator,
5236 makeBufferCreateInfo(sizes[1], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
5237 VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
5238 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
5239 }
5240 catch (tcu::ResourceError &)
5241 {
5242 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
5243 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
5244 "Failed device memory allocation " + de::toString(sizes[1]) + " bytes");
5245 }
5246 bufferDescriptors[1] = makeDescriptorBufferInfo(**buffers[1], 0, sizes[1]);
5247 ptrs[1] = buffers[1]->getAllocation().getHostPtr();
5248
5249 vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
5250 setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(1),
5251 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[1]);
5252 setUpdateBuilder2.update(vk, device);
5253 }
5254
5255 // Clear any writes to buffer[1] during the counting pass
5256 deMemset(ptrs[1], 0, (size_t)sizes[1]);
5257 flushAlloc(vk, device, buffers[1]->getAllocation());
5258 // Clear any writes to buffer[2] during the counting pass
5259 deMemset(ptrs[2], 0, (size_t)sizes[2]);
5260 flushAlloc(vk, device, buffers[2]->getAllocation());
5261
5262     // set the real invocationStride so the shader writes out ballots this time
5263 pc.invocationStride = invocationStride;
5264
5265 // run the actual shader
5266 beginCommandBuffer(vk, *cmdBuffer, 0u);
5267 vk.cmdBindDescriptorSets(*cmdBuffer, bindPoint, *pipelineLayout, 0u, 1, &*descriptorSet, 0u, DE_NULL);
5268 vk.cmdBindPipeline(*cmdBuffer, bindPoint, *pipeline);
5269 vk.cmdPushConstants(*cmdBuffer, *pipelineLayout, m_data.shaderStage, 0, sizeof(pc), &pc);
5270 vk.cmdDispatch(*cmdBuffer, 1, 1, 1);
5271 endCommandBuffer(vk, *cmdBuffer);
5272
5273 submitCommandsAndWait(vk, device, queue, cmdBuffer.get());
5274
5275 invalidateAlloc(vk, device, buffers[1]->getAllocation());
5276
5277 // Simulate execution on the CPU, and compare against the GPU result
5278 try
5279 {
5280 ref.resize(maxLoc, tcu::UVec4());
5281 }
5282 catch (const std::bad_alloc &)
5283 {
5284 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
5285 return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
5286                                "Failed system memory allocation " + de::toString(maxLoc * sizeof(tcu::UVec4)) + " bytes");
5287 }
5288
5289 program.execute(m_context.getTestContext().getWatchDog(), false, m_subgroupSize, 0u, invocationStride, ref, log);
5290
5291 const tcu::UVec4 *result = (const tcu::UVec4 *)ptrs[1];
5292
5293 qpTestResult res = calculateAndLogResult(result, ref, invocationStride, m_subgroupSize, shaderMaxLoc);
5294
5295 return tcu::TestStatus(res, qpGetTestResultName(res));
5296 }
5297
5298 qpTestResult_e ReconvergenceTestComputeInstance::calculateAndLogResult(const tcu::UVec4 *result,
5299 const std::vector<tcu::UVec4> &ref,
5300 uint32_t invocationStride, uint32_t subgroupSize,
5301 uint32_t shaderMaxLoc)
5302 {
5303 const uint32_t maxLoc = static_cast<uint32_t>(ref.size());
5304 tcu::TestLog &log = m_context.getTestContext().getLog();
5305 qpTestResult res = QP_TEST_RESULT_PASS;
5306 DE_ASSERT(subgroupSize * shaderMaxLoc <= maxLoc);
5307 DE_UNREF(shaderMaxLoc);
5308
5309 uint32_t mismatchCount = 0u;
5310 const uint32_t printMismatchCount = 5u;
5311 if (m_data.testType == TT_MAXIMAL)
5312 {
5313 // With maximal reconvergence, we should expect the output to exactly match
5314 // the reference.
5315 for (uint32_t i = 0; i < maxLoc; ++i)
5316 {
5317 const Ballot resultVal(result[i], subgroupSize);
5318 const Ballot refVal(ref[i], subgroupSize);
5319 if (resultVal != refVal)
5320 {
5321 res = QP_TEST_RESULT_FAIL;
5322 if (mismatchCount++ < printMismatchCount)
5323 {
5324                 log << tcu::TestLog::Message << "Mismatch at " << i << "\n  result: " << resultVal
5325                     << "\n     ref: " << refVal << tcu::TestLog::EndMessage;
5326 }
5327 else
5328 break;
5329 }
5330 }
5331
5332 #if 0 // This log can be large and slow, ifdef it out by default
5333 log << tcu::TestLog::Message << "subgroupSize:" << subgroupSize << ", invocationStride:" << invocationStride << ", maxLoc:" << shaderMaxLoc << tcu::TestLog::EndMessage;
5334 uint32_t invMax = std::min(invocationStride, 50u);
5335 for (uint32_t inv = 0; inv < invMax; ++inv)
5336 {
5337 auto ll = log << tcu::TestLog::Message;
5338 ll << inv << ": ";
5339 for (uint32_t loc = 0; loc < shaderMaxLoc; ++loc)
5340 {
5341 			const Ballot entry(result[loc * invocationStride + inv], subgroupSize);
5342 			ll << de::toString(loc) << ":" << entry << ' ';
5343 }
5344 ll << tcu::TestLog::EndMessage;
5345 }
5346 #endif
5347
5348 if (res != QP_TEST_RESULT_PASS)
5349 {
5350 for (uint32_t i = 0; i < maxLoc; ++i)
5351 {
5352 #if 0
5353 // This log can be large and slow, ifdef it out by default
5354 const Ballot resultVal(result[i], subgroupSize);
5355 const Ballot refVal(ref[i], subgroupSize);
5356 log << tcu::TestLog::Message << "result " << i << "(" << (i / invocationStride) << ", " << (i % invocationStride) << "): " << resultVal << " ref " << refVal << (resultVal != refVal ? " different" : "") << tcu::TestLog::EndMessage;
5357 #endif
5358 }
5359 }
5360 }
5361 else
5362 {
5363 DE_ASSERT(subgroupSize != 0);
5364
5365 Ballot fullMask = subgroupSizeToMask(subgroupSize, 0 /* ignored */);
5366 // For subgroup_uniform_control_flow, we expect any fully converged outputs in the reference
5367 // to have a corresponding fully converged output in the result. So walk through each lane's
5368 // results, and for each reference value of fullMask, find a corresponding result value of
5369 // fullMask where the previous value (OP_STORE) matches. That means these came from the same
5370 // source location.
5371 vector<uint32_t> firstFail(invocationStride, 0);
5372 for (uint32_t lane = 0; lane < invocationStride; ++lane)
5373 {
5374 uint32_t resLoc = lane + invocationStride, refLoc = lane + invocationStride;
5375 while (refLoc < maxLoc)
5376 {
5377 while (refLoc < maxLoc && ref[refLoc] != fullMask)
5378 refLoc += invocationStride;
5379 if (refLoc >= maxLoc)
5380 break;
5381
5382 // For TT_SUCF_ELECT, when the reference result has a full mask, we expect lane 0 to be elected
5383 // (a value of 2) and all other lanes to be not elected (a value of 1). For TT_SUCF_BALLOT, we
5384 // expect a full mask. Search until we find the expected result with a matching store value in
5385 // the previous result.
5386 Ballot expectedResult = m_data.isElect() ? Ballot((lane % m_subgroupSize) == 0 ? 2 : 1) : fullMask;
5387
5388 while (resLoc < maxLoc && !(result[resLoc] == expectedResult &&
5389 result[resLoc - invocationStride] == ref[refLoc - invocationStride]))
5390 resLoc += invocationStride;
5391
5392 // If we didn't find this output in the result, flag it as an error.
5393 if (resLoc >= maxLoc)
5394 {
5395 firstFail[lane] = refLoc;
5396 log << tcu::TestLog::Message << "lane " << lane << " first mismatch at " << firstFail[lane]
5397 << tcu::TestLog::EndMessage;
5398 res = QP_TEST_RESULT_FAIL;
5399 break;
5400 }
5401 refLoc += invocationStride;
5402 resLoc += invocationStride;
5403 }
5404 }
5405
5406 if (res != QP_TEST_RESULT_PASS)
5407 {
5408 for (uint32_t i = 0; i < maxLoc; ++i)
5409 {
5410 // This log can be large and slow, ifdef it out by default
5411 #if 0
5412 log << tcu::TestLog::Message << "result " << i << "(" << (i / invocationStride) << ", " << (i % invocationStride) << "): " << tcu::toHex(result[i]) << " ref " << tcu::toHex(ref[i]) << (i == firstFail[i % invocationStride] ? " first fail" : "") << tcu::TestLog::EndMessage;
5413 #endif
5414 }
5415 }
5416 }
5417
5418 return res;
5419 }
5420
5421 VkRenderPassBeginInfo ReconvergenceTestGraphicsInstance::makeRenderPassBeginInfo(const VkRenderPass renderPass,
5422 const VkFramebuffer framebuffer)
5423 {
5424 static const VkClearValue clearValue{{{0u, 0u, 0u, 0u}}};
5425 return {
5426 VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, // VkStructureType sType;
5427 nullptr, // const void* pNext;
5428 renderPass, // VkRenderPass renderPass;
5429 framebuffer, // VkFramebuffer framebuffer;
5430 makeRect2D(m_data.sizeX, m_data.sizeY), // VkRect2D renderArea;
5431 1u, // uint32_t clearValueCount;
5432 &clearValue // const VkClearValue* pClearValues;
5433 };
5434 }
5435
5436 de::MovePtr<BufferWithMemory> ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(
5437 uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
5438 {
5439 uint32_t vertexCount = cellsHorz * cellsVert;
5440 uint32_t triangleCount = cellsHorz * cellsVert;
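    // One cell corresponds to one primitive: a triangle list needs 3 vertices per triangle, a strip
    // needs triangleCount + 2 vertices, and point/patch lists use the cell count directly as the
    // vertex count, deriving the triangle count from it.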
5441 switch (topology)
5442 {
5443 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
5444 vertexCount = triangleCount * 3;
5445 break;
5446 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
5447 vertexCount = triangleCount - 1 + 3;
5448 break;
5449 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
5450 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
5451 triangleCount = vertexCount - 3 + 1;
5452 break;
5453 default:
5454 DE_ASSERT(0);
5455 }
5456
5457 const DeviceInterface &vk = m_context.getDeviceInterface();
5458 const VkDevice device = m_context.getDevice();
5459 Allocator &allocator = m_context.getDefaultAllocator();
5460 const VkDeviceSize bufferSize = VkDeviceSize(vertexCount) * sizeof(Vertex);
5461 const VkBufferUsageFlags bufferUsage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
5462 const VkBufferCreateInfo createInfo = makeBufferCreateInfo(bufferSize, bufferUsage);
5463 const MemoryRequirement memoryReqs = (MemoryRequirement::HostVisible | MemoryRequirement::Coherent);
5464 de::MovePtr<BufferWithMemory> buffer(new BufferWithMemory(vk, device, allocator, createInfo, memoryReqs));
5465 Allocation &allocation = buffer->getAllocation();
5466 Vertex *vertices = static_cast<Vertex *>(allocation.getHostPtr());
5467
5468 if (VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST == topology)
5469 {
5470 const float stepX = 2.0f / float(cellsHorz);
5471 const float stepY = 2.0f / float(cellsVert);
5472
5473 uint32_t t = 0;
5474 float y = -1.0f;
5475 for (uint32_t h = 0; h < cellsVert; ++h)
5476 {
5477 float x = -1.0f;
5478 const float yy = y + stepY;
5479 for (uint32_t w = 0; w < cellsHorz; ++w)
5480 {
5481 const float xx = x + stepX;
5482
5483 vertices[t++] = {x, yy, 0.f, 0.f};
5484 vertices[t++] = {((xx + x) / 2.f), y, 0.f, 0.f};
5485 vertices[t++] = {xx, ((yy + y) / 2.f), 0.f, 0.f};
5486
5487 x = xx;
5488 }
5489 y = yy;
5490 }
5491 DE_ASSERT(vertexCount == t);
5492 }
5493 else
5494 {
5495 const uint32_t div = static_cast<uint32_t>(ROUNDUP(triangleCount, 2) / 2);
5496 const float step = 2.0f / static_cast<float>(div);
5497
5498 uint32_t t = 0;
5499 float x = -1.0f;
5500 for (uint32_t i = 0; i < div; ++i)
5501 {
5502 const bool last = ((div - i) == 1u);
5503 const float xNext = last ? +1.0f : (x + step);
5504
5505 const Vertex v0{x, +1.0f, 0.0f, 0.0f};
5506 const Vertex v1{xNext, +1.0f, 0.0f, 0.0f};
5507 const Vertex v2{xNext, -1.0f, 0.0f, 0.0f};
5508 const Vertex v3{x, -1.0f, 0.0f, 0.0f};
5509
5510 if (t == 0)
5511 {
5512 vertices[0] = v0;
5513 vertices[1] = v3;
5514 vertices[2] = v1;
5515
5516 t = 3;
5517 }
5518 else
5519 {
5520 vertices[t++] = v1;
5521 }
5522
5523 if (!last || !(triangleCount % 2))
5524 {
5525 vertices[t++] = v2;
5526 }
5527
5528 x += step;
5529 }
5530 DE_ASSERT(vertexCount == t);
5531 }
5532
5533 flushAlloc(vk, device, allocation);
5534 return buffer;
5535 }
5536 std::vector<tcu::Vec4> ReconvergenceTestGraphicsInstance::generateVertices(const uint32_t primitiveCount,
5537 const VkPrimitiveTopology topology,
5538 const uint32_t patchSize)
5539 {
5540 auto cast = [](const float f) -> float { return ((f * 2.0f) - 1.0f); };
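    // bestRect(c) picks a near-square (columns, rows) pair whose product covers c, used to spread the
    // generated primitives over the render area.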
5541 auto bestRect = [](const uint32_t c) -> std::pair<uint32_t, uint32_t>
5542 {
5543 uint32_t a = 1;
5544 uint32_t b = 1;
5545 do
5546 {
5547 a = a + 1;
5548 b = (c / a) + ((c % a) ? 1 : 0);
5549 } while (a < b);
5550 return {a, b};
5551 };
5552
5553 uint32_t triangleCount = 0;
5554 uint32_t vertexCount = 0;
5555 switch (topology)
5556 {
5557 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
5558 triangleCount = primitiveCount;
5559 vertexCount = triangleCount + 3 - 1;
5560 break;
5561 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
5562 triangleCount = primitiveCount;
5563 vertexCount = triangleCount * 3;
5564 break;
5565 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
5566 vertexCount = primitiveCount;
5567 break;
5568 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
5569 vertexCount = primitiveCount * patchSize;
5570 triangleCount = ROUNDUP(vertexCount, 3) / 3;
5571 break;
5572 default:
5573 DE_ASSERT(false);
5574 }
5575
5576 if (3 == vertexCount)
5577 {
5578 return {{-1.0f, +1.0f, 0.0f, 1.0f}, {0.0f, -1.0f, 0.0f, 1.0f}, {+1.0f, +1.0f, 0.0f, 1.0f}};
5579 }
5580
5581 std::vector<tcu::Vec4> vertices(vertexCount);
5582
5583 if (VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP == topology)
5584 {
5585 uint32_t v = 0;
5586 const uint32_t div = ROUNDUP(triangleCount, 2) / 2;
5587
5588 for (uint32_t i = 0; i < triangleCount && v < vertexCount; ++i)
5589 {
5590 const float xx = cast(float((i / 2) + 1) / float(div));
5591 if (0 == i)
5592 {
5593 const float x = cast(float(i / 2) / float(div));
5594 vertices[v++] = {x, +1.0f, 0.0f, 1.0f};
5595 vertices[v++] = {x, -1.0f, 0.0f, 1.0f};
5596 vertices[v++] = {xx, +1.0f, 0.0f, 1.0f};
5597 }
5598 else
5599 {
5600 if (i % 2)
5601 vertices[v++] = {xx, -1.0f, 0.0f, 1.0f};
5602 else
5603 vertices[v++] = {xx, +1.0f, 0.0f, 1.0f};
5604 }
5605 }
5606 DE_ASSERT(vertexCount == v);
5607 }
5608 else if (VK_PRIMITIVE_TOPOLOGY_POINT_LIST == topology)
5609 {
5610 uint32_t v = 0;
5611 const auto rect = bestRect(vertexCount);
5612
5613 float y = -1.0f;
5614 for (uint32_t h = 0; h < rect.second; ++h)
5615 {
5616 const float yy = cast(float(h + 1) / float(rect.second));
5617 float x = -1.0f;
5618 for (uint32_t w = 0; w < rect.first && v < vertexCount; ++w)
5619 {
5620 const float xx = cast(float(w + 1) / float(rect.first));
5621                 vertices[v++] = {((xx + x) / 2.0f), ((yy + y) / 2.0f), 0.0f, 1.0f}; // cell midpoint, matching the triangle-list path
5622 x = xx;
5623 }
5624 y = yy;
5625 }
5626 DE_ASSERT(vertexCount == v);
5627 }
5628 else if (VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST == topology || VK_PRIMITIVE_TOPOLOGY_PATCH_LIST == topology)
5629 {
5630 uint32_t v = 0;
5631 const auto rect = bestRect(triangleCount);
5632
5633 float y = -1.0f;
5634 for (uint32_t h = 0; h < rect.second && v < vertexCount; ++h)
5635 {
5636 const float yy = cast(float(h + 1) / float(rect.second));
5637 float x = -1.0f;
5638 for (uint32_t w = 0; w < rect.first && v < vertexCount; ++w)
5639 {
5640 const float xx = cast(float(w + 1) / float(rect.first));
5641 if (v < vertexCount)
5642 vertices[v++] = {x, yy, 0.f, 0.f};
5643 if (v < vertexCount)
5644 vertices[v++] = {((xx + x) / 2.f), y, 0.f, 0.f};
5645 if (v < vertexCount)
5646 vertices[v++] = {xx, ((yy + y) / 2.f), 0.f, 0.f};
5647 x = xx;
5648 }
5649 y = yy;
5650 }
5651 DE_ASSERT(vertexCount == v);
5652 }
5653
5654 return vertices;
5655 }
5656
5657 de::MovePtr<BufferWithMemory> ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(
5658 const std::vector<tcu::Vec4> &vertices)
5659 {
5660 const DeviceInterface &vk = m_context.getDeviceInterface();
5661 const VkDevice device = m_context.getDevice();
5662 Allocator &allocator = m_context.getDefaultAllocator();
5663 const VkDeviceSize bufferSize = VkDeviceSize(vertices.size()) * sizeof(tcu::Vec4);
5664 const VkBufferUsageFlags bufferUsage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
5665 const VkBufferCreateInfo createInfo = makeBufferCreateInfo(bufferSize, bufferUsage);
5666 const MemoryRequirement memoryReqs = (MemoryRequirement::HostVisible | MemoryRequirement::Coherent);
5667 de::MovePtr<BufferWithMemory> buffer(new BufferWithMemory(vk, device, allocator, createInfo, memoryReqs));
5668 Allocation &allocation = buffer->getAllocation();
5669 auto bufferRange = makeStdBeginEnd<tcu::Vec4>(allocation.getHostPtr(), (uint32_t)vertices.size());
5670 std::copy(vertices.begin(), vertices.end(), bufferRange.first);
5671 flushAlloc(vk, device, allocation);
5672 return buffer;
5673 }
5674
5675 void ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit(
5676 const VkCommandBuffer cmdBuffer, const VkPipelineLayout pipelineLayout, const VkPipeline pipeline,
5677 const VkDescriptorSet descriptorSet, const PushConstant &pushConstant, const VkRenderPassBeginInfo &renderPassInfo,
5678 const VkBuffer vertexBuffer, const uint32_t vertexCount, const VkImage image)
5679 {
5680 DE_UNREF(image);
5681 const DeviceInterface &vk = m_context.getDeviceInterface();
5682 const VkDevice device = m_context.getDevice();
5683 const VkQueue queue = m_context.getUniversalQueue();
5684 const VkPipelineBindPoint bindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS;
5685
5686 beginCommandBuffer(vk, cmdBuffer, 0u);
5687 vk.cmdBindDescriptorSets(cmdBuffer, bindPoint, pipelineLayout, 0u, 1u, &descriptorSet, 0u, DE_NULL);
5688 vk.cmdBindPipeline(cmdBuffer, bindPoint, pipeline);
5689 vk.cmdBindVertexBuffers(cmdBuffer, 0u, 1u, &static_cast<const VkBuffer &>(vertexBuffer),
5690 &static_cast<const VkDeviceSize &>(0u));
5691 vk.cmdPushConstants(cmdBuffer, pipelineLayout, m_data.shaderStage, 0, sizeof(PushConstant), &pushConstant);
5692 vk.cmdBeginRenderPass(cmdBuffer, &renderPassInfo, VK_SUBPASS_CONTENTS_INLINE);
5693 vk.cmdDraw(cmdBuffer, vertexCount, 1u, 0u, 0u);
5694 vk.cmdEndRenderPass(cmdBuffer);
5695 endCommandBuffer(vk, cmdBuffer);
5696
5697 submitCommandsAndWait(vk, device, queue, cmdBuffer);
5698 }
5699
5700 std::vector<Move<VkShaderModule>> ReconvergenceTestFragmentInstance::createShaders(void)
5701 {
5702 const DeviceInterface &vk = m_context.getDeviceInterface();
5703 const VkDevice device = m_context.getDevice();
5704
5705 Move<VkShaderModule> vertex = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"), 0);
5706 Move<VkShaderModule> fragment = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"), 0);
5707
5708     // shader order: { vert, frag }, then tesc, tese, geom when present
5709 std::vector<Move<VkShaderModule>> shaders;
5710 shaders.emplace_back(vertex);
5711 shaders.emplace_back(fragment);
5712
5713 return shaders;
5714 }
5715
5716 qpTestResult_e ReconvergenceTestGraphicsInstance::calculateAndLogResult(const uint64_t *result,
5717 const std::vector<uint64_t> &ref,
5718 uint32_t invocationStride,
5719 uint32_t subgroupSize, uint32_t shaderMaxLocs,
5720 uint32_t primitiveCount, PrintMode printMode)
5721 {
5722 DE_ASSERT(m_data.testType == TT_MAXIMAL);
5723
5724 const uint32_t maxLoc = static_cast<uint32_t>(ref.size());
5725 tcu::TestLog &log = m_context.getTestContext().getLog();
5726 qpTestResult res = QP_TEST_RESULT_PASS;
5727 uint32_t mismatchCount = 0;
5728
5729 DE_ASSERT(shaderMaxLocs * invocationStride <= maxLoc);
5730
5731 // With maximal reconvergence, we should expect the output to exactly match
5732 // the reference.
5733 for (uint32_t i = 0; i < maxLoc; ++i)
5734 {
5735 const uint64_t resultVal = result[i];
5736 const uint64_t refVal = ref[i];
5737 if (resultVal != refVal)
5738 {
5739 if (1 > mismatchCount++)
5740 {
5741 log << tcu::TestLog::Message << mismatchCount << ": Mismatch at " << i
5742 << ", res: " << tcu::toHex(resultVal) << ", ref: " << tcu::toHex(refVal)
5743 << tcu::TestLog::EndMessage;
5744 }
5745 }
5746 }
5747
5748 if (PrintMode::None != printMode)
5749 {
5750 log << tcu::TestLog::Message << "deviceSubgroupSize: " << m_subgroupSize
5751 << ", testSubgroupSize: " << subgroupSize << ", invocationStride: " << invocationStride
5752 << ", shaderMaxLocs: " << shaderMaxLocs << "\n\t, framebuffer: " << m_data.sizeX << 'x' << m_data.sizeY
5753 << ", primitiveCount: " << primitiveCount << ", PRINT_MODE: "
5754 << ((PrintMode::ThreadsInColumns == printMode) ?
5755                    "\"outLocs in rows & threads in columns\"" :
5756 ((PrintMode::OutLocsInColumns == printMode) ? "\"threads in rows & outLocs in columns\"" : ""))
5757 << " { id:res,ref }\n"
5758 << tcu::TestLog::EndMessage;
5759 }
5760
5761 uint32_t invMax = std::min(invocationStride, 80u);
5762
5763 if (PrintMode::ThreadsInColumns == printMode)
5764 {
5765 for (uint32_t loc = 0; loc < shaderMaxLocs; ++loc)
5766 {
5767 auto l1 = log << tcu::TestLog::Message;
5768 l1 << "loc " << std::setw(3) << loc << ": ";
5769 for (uint32_t inv = 0; inv < invMax; ++inv)
5770 {
5771 uint32_t idx = loc * invocationStride + inv;
5772 DE_ASSERT(idx < maxLoc);
5773 uint64_t resEntry = result[idx];
5774 uint64_t refEntry = ref[idx];
5775 //l1 << de::toString(inv) << ':' << tcu::toHex(resEntry) << ',' << tcu::toHex(refEntry) << ' ';
5776 l1 << std::dec << inv << ':' << std::setw(subgroupSize / 4) << std::hex << resEntry << ','
5777 << std::setw(subgroupSize / 4) << std::hex << refEntry << std::dec << ' ';
5778 }
5779 l1 << std::setw(0) << tcu::TestLog::EndMessage;
5780 }
5781 }
5782 else if (PrintMode::OutLocsInColumns == printMode)
5783 {
5784 for (uint32_t inv = 0; inv < invMax; ++inv)
5785 {
5786 auto l1 = log << tcu::TestLog::Message;
5787 l1 << "res " << std::setw(3) << inv << ": ";
5788 for (uint32_t loc = 0; loc < shaderMaxLocs; ++loc)
5789 {
5790 uint32_t idx = loc * invocationStride + inv;
5791 DE_ASSERT(idx < maxLoc);
5792 uint64_t entry = result[idx];
5793 l1 << de::toString(loc) << ':' << tcu::toHex(entry) << ' ';
5794 }
5795 l1 << std::setw(0) << tcu::TestLog::EndMessage;
5796
5797 auto l2 = log << tcu::TestLog::Message;
5798 l2 << "ref " << std::setw(3) << inv << ": ";
5799 for (uint32_t loc = 0; loc < shaderMaxLocs; ++loc)
5800 {
5801 uint32_t idx = loc * invocationStride + inv;
5802 DE_ASSERT(idx < maxLoc);
5803 uint64_t entry = ref[idx];
5804 l2 << de::toString(loc) << ':' << tcu::toHex(entry) << ' ';
5805 }
5806 l2 << std::setw(0) << tcu::TestLog::EndMessage;
5807 }
5808 }
5809
5810 if (mismatchCount)
5811 {
5812 double mismatchPercentage = 0.0;
5813 std::modf((double)(mismatchCount * 100) / (double)maxLoc, &mismatchPercentage);
5814 log << tcu::TestLog::Message << "Mismatch count " << mismatchCount << " from " << maxLoc << " ("
5815 << mismatchPercentage << "%)" << tcu::TestLog::EndMessage;
5816 res = QP_TEST_RESULT_FAIL;
5817 }
5818
5819 if (res != QP_TEST_RESULT_PASS)
5820 {
5821 for (uint32_t i = 0; i < maxLoc; ++i)
5822 {
5823 // This log can be large and slow, ifdef it out by default
5824 #if 0
5825 log << tcu::TestLog::Message << "result " << i << "(" << (i / invocationStride) << ", " << (i % invocationStride) << "): " << tcu::toHex(result[i]) << " ref " << tcu::toHex(ref[i]) << (result[i] != ref[i] ? " different" : "") << tcu::TestLog::EndMessage;
5826 #endif
5827 }
5828 }
5829
5830 return res;
5831 }
5832
5833 qpTestResult_e ReconvergenceTestFragmentInstance::calculateAndLogResultEx(tcu::TestLog &log, const tcu::UVec4 *result,
5834 const std::vector<tcu::UVec4> &ref,
5835 const uint32_t maxLoc, const Arrangement &a,
5836 const PrintMode printMode)
5837 {
5839
5840 qpTestResult res = QP_TEST_RESULT_PASS;
5841 uint32_t mismatchCount = 0u;
5842 const uint32_t printMismatchCount = 5u;
5843 const FragmentRandomProgram::Arrangement &aa = static_cast<const FragmentRandomProgram::Arrangement &>(a);
5844
5845 // With maximal reconvergence, we should expect the output to exactly match
5846 // the reference.
5847 const uint32_t ballotStoreCount = maxLoc * aa.m_invocationStride * aa.m_primitiveStride;
5848 for (uint32_t i = 0; i < ballotStoreCount; ++i)
5849 {
5850         const Ballot resultVal(result[i], aa.m_subgroupSize);
5852         const Ballot refVal(ref[i], aa.m_subgroupSize);
5853 if (resultVal != refVal)
5854 {
5855 if (mismatchCount++ < printMismatchCount)
5856 {
5857 res = QP_TEST_RESULT_FAIL;
5858                 log << tcu::TestLog::Message << "Mismatch at " << i << "\n  result: " << resultVal
5859                     << "\n     ref: " << refVal << tcu::TestLog::EndMessage;
5860 if (printMode == PrintMode::Console)
5861 {
5862                     std::cout << "Mismatch at " << i << "\n  result: " << resultVal << "\n     ref: " << refVal
5863                               << std::endl;
5864 }
5865 }
5866 }
5867 }
5868
5869 log << tcu::TestLog::Message << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount
5870 << tcu::TestLog::EndMessage;
5871 if (printMode == PrintMode::Console)
5872 {
5873 std::cout << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount << std::endl;
5874 }
5875
5876 return res;
5877 }
5878
5879 VkImageCreateInfo ReconvergenceTestFragmentInstance::makeImageCreateInfo(VkFormat format) const
5880 {
5881 return {
5882 VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
5883 nullptr, // const void* pNext;
5884 VkImageCreateFlags(0), // VkImageCreateFlags flags;
5885 VK_IMAGE_TYPE_2D, // VkImageType imageType;
5886 format, // VkFormat format;
5887 {m_data.sizeX, m_data.sizeY, 1u}, // VkExtent3D extent;
5888 1u, // uint32_t mipLevels;
5889 1u, // uint32_t arrayLayers;
5890 VK_SAMPLE_COUNT_1_BIT, // VkSampleCountFlagBits samples;
5891 VK_IMAGE_TILING_OPTIMAL, // VkImageTiling tiling;
5892 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
5893 VK_SHARING_MODE_EXCLUSIVE, // VkSharingMode sharingMode;
5894 0u, // uint32_t queueFamilyIndexCount;
5895 0u, // const uint32_t* pQueueFamilyIndices;
5896 VK_IMAGE_LAYOUT_UNDEFINED // VkImageLayout initialLayout;
5897 };
5898 }
5899
5900 de::MovePtr<BufferWithMemory> ReconvergenceTestFragmentInstance::createVertexBufferAndFlush(
5901 uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
5902 {
5903 // DE_ASSERT(cellsHorz == 2u);
5904 // DE_ASSERT((cellsHorz * 3) == cellsVert);
5905 DE_UNREF(cellsHorz);
5906 DE_UNREF(cellsVert);
5907 DE_UNREF(topology);
5908 DE_ASSERT(topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST);
5909 const std::vector<tcu::Vec4> vertices{{-1.0f, 0.0f, 0.0f, 0.0f}, {-0.5f, -1.0f, 0.0f, 0.0f},
5910 {+1.0f, +1.0f, 0.0f, 0.0f}, {+0.5f, -1.0f, 0.0f, 0.0f},
5911 {+1.0f, 0.0f, 0.0f, 0.0f}, {-1.0f, +1.0f, 0.0f, 0.0f}};
5912 return ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(vertices);
5913 }
5914
5915 std::vector<uint32_t> ReconvergenceTestFragmentInstance::callAuxiliaryShader(tcu::TestStatus &status,
5916 uint32_t triangleCount)
5917 {
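    // Run the auxiliary fragment shader (prologue + epilogue only, no random program) over the
    // full-screen geometry. It records, per fragment and primitive, which subgroup and lane processed
    // it (helper invocations included) and writes the subgroup count, subgroup size and invocation
    // totals into the trailing slots of the buffer; iterate() uses this map to build the CPU reference.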
5918 const DeviceInterface &vk = m_context.getDeviceInterface();
5919 const VkDevice device = m_context.getDevice();
5920 add_ref<Allocator> allocator = m_context.getDefaultAllocator();
5921 const uint32_t queueIndex = m_context.getUniversalQueueFamilyIndex();
5922 //add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
5923     const uint32_t bufferElems = m_data.sizeX * m_data.sizeY * triangleCount + 4u; // per-fragment/per-primitive entries plus four trailing counters written by the shader
5924 const VkDeviceSize bufferSize = bufferElems * sizeof(uint32_t);
5925
5926 if (bufferSize > m_context.getDeviceProperties().limits.maxStorageBufferRange)
5927 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
5928
5929 const VkBufferCreateInfo createInfo =
5930 vk::makeBufferCreateInfo(bufferSize, (VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
5931 VK_BUFFER_USAGE_TRANSFER_SRC_BIT));
5932 de::MovePtr<BufferWithMemory> buffer;
5933 try
5934 {
5935 buffer = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
5936 vk, device, allocator, createInfo, (MemoryRequirement::HostVisible | MemoryRequirement::Coherent)));
5937 }
5938 catch (tcu::ResourceError &)
5939 {
5940 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
5941 status = tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
5942 "Failed device memory allocation " + de::toString(bufferSize) + " bytes");
5943 return {};
5944 }
5945
5946 const VkDescriptorBufferInfo bufferInfo = makeDescriptorBufferInfo(**buffer, 0, bufferSize);
5947
5948 vk::DescriptorSetLayoutBuilder layoutBuilder;
5949 layoutBuilder.addSingleBinding(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_SHADER_STAGE_FRAGMENT_BIT);
5950 vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
5951
5952 vk::DescriptorPoolBuilder poolBuilder;
5953 poolBuilder.addType(vk::VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1u);
5954 vk::Unique<vk::VkDescriptorPool> descriptorPool(
5955 poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
5956
5957 vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
5958
5959 vk::DescriptorSetUpdateBuilder setUpdateBuilder;
5960 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(0),
5961 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferInfo);
5962 setUpdateBuilder.update(vk, device);
5963
5964 const VkPushConstantRange pushConstantRange{
5965 VK_SHADER_STAGE_FRAGMENT_BIT, // VkShaderStageFlags stageFlags;
5966 0u, // uint32_t offset;
5967 sizeof(PushConstant) // uint32_t size;
5968 };
5969 const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
5970 VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
5971 DE_NULL, // pNext
5972 (VkPipelineLayoutCreateFlags)0, // flags
5973 1u, // setLayoutCount
5974 &descriptorSetLayout.get(), // pSetLayouts
5975 1u, // pushConstantRangeCount
5976 &pushConstantRange, // pPushConstantRanges
5977 };
5978
5979 const VkFormat format = VK_FORMAT_R8G8B8A8_UNORM;
5980 const VkImageCreateInfo imageCreateInfo = makeImageCreateInfo(format);
5981 const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
5982 de::MovePtr<ImageWithMemory> image(
5983 new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
5984 Move<VkImageView> view = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
5985 Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
5986 Move<VkFramebuffer> framebuffer =
5987 makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
5988 const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
5989 auto createAuxShaders = [&]()
5990 {
5991 Shaders shaders;
5992 auto vert = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"), 0);
5993 auto frag = createShaderModule(vk, device, m_context.getBinaryCollection().get("aux"), 0);
5994 shaders.emplace_back(vert);
5995 shaders.emplace_back(frag);
5996 return shaders;
5997 };
5998 const Shaders shaders = createAuxShaders();
5999 const uint32_t vertexCount = triangleCount * 3u;
6000 de::MovePtr<BufferWithMemory> vertexBuffer =
6001 createVertexBufferAndFlush(triangleCount, vertexCount, VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST);
6002 Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
6003 Move<VkPipeline> pipeline = createGraphicsPipeline(*pipelineLayout, *renderPass, m_data.sizeX, m_data.sizeY,
6004 shaders, VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, 0U);
6005 Move<VkCommandPool> cmdPool =
6006 createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
6007 Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6008
6009 PushConstant pc{};
6010 pc.invocationStride = 0u;
6011 pc.width = m_data.sizeX;
6012 pc.height = m_data.sizeY;
6013 pc.primitiveStride = triangleCount;
6014
6015 void *ptr = buffer->getAllocation().getHostPtr();
6016 auto bufferRange = makeStdBeginEnd<uint32_t>(ptr, bufferElems);
6017 std::fill(bufferRange.first, bufferRange.second, 0u);
6018
6019     recordDrawingAndSubmit(*cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, pc, renderBeginInfo,
6020                            **vertexBuffer, vertexCount, **image);
6021
6022 status = tcu::TestStatus::pass(std::string());
6023 return std::vector<uint32_t>(bufferRange.first, bufferRange.second);
6024 }
6025
6026 tcu::TestStatus ReconvergenceTestFragmentInstance::iterate(void)
6027 {
6028 const DeviceInterface &vk = m_context.getDeviceInterface();
6029 const VkDevice device = m_context.getDevice();
6030 add_ref<Allocator> allocator = m_context.getDefaultAllocator();
6031 const uint32_t queueIndex = m_context.getUniversalQueueFamilyIndex();
6032 add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
6033 const VkPhysicalDeviceLimits &limits = m_context.getDeviceProperties().limits;
6034 const uint32_t fragmentStride = m_data.sizeX * m_data.sizeY;
6035 const uint32_t primitiveStride = 2;
6036
6037 if (sizeof(PushConstant) > limits.maxPushConstantsSize)
6038 {
6039 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6040 "PushConstant size " + std::to_string(sizeof(PushConstant)) + " exceeds device limit " +
6041 std::to_string(limits.maxPushConstantsSize));
6042 }
6043
6044 tcu::TestStatus auxStatus(QP_TEST_RESULT_FAIL, std::string());
6045 std::vector<uint32_t> primitiveMap = callAuxiliaryShader(auxStatus, primitiveStride);
6046 if (auxStatus.isFail())
6047 return auxStatus;
6048
6049 const uint32_t shaderSubgroupSize = primitiveMap.at(fragmentStride * primitiveStride + 1u);
6050 if (shaderSubgroupSize != m_subgroupSize)
6051 {
6052 return tcu::TestStatus(QP_TEST_RESULT_FAIL,
6053 "The size of the subgroup from the shader (" + std::to_string(shaderSubgroupSize) +
6054 ") is different from the size of the subgroup from the device (" +
6055 std::to_string(m_subgroupSize) + ")");
6056 }
6057 const uint32_t shaderSubgroupStride = primitiveMap.at(fragmentStride * primitiveStride + 0u);
6058 const uint32_t hostSubgroupStride =
6059 FragmentRandomProgram::Arrangement::calcSubgroupCount(primitiveMap, primitiveStride, fragmentStride);
6060 if (shaderSubgroupStride != hostSubgroupStride)
6061 {
6062 return tcu::TestStatus(QP_TEST_RESULT_FAIL,
6063 "The number of subgroups from the shader (" + std::to_string(shaderSubgroupStride) +
6064 ") is different from the number of subgroups calculated manually (" +
6065 std::to_string(hostSubgroupStride) + ")");
6066 }
6067
6068 log << tcu::TestLog::Message << "Subgroup count: " << hostSubgroupStride << tcu::TestLog::EndMessage;
6069 log << tcu::TestLog::Message << "Subgroup size: " << m_subgroupSize << tcu::TestLog::EndMessage;
6070
6071 const VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
6072 de::MovePtr<BufferWithMemory> vertexBuffer =
6073 createVertexBufferAndFlush(primitiveStride, (primitiveStride * 3u), topology);
6074
6075 std::vector<tcu::UVec4> ref;
6076 de::MovePtr<FragmentRandomProgram> program = FragmentRandomProgram::create(m_data);
6077 program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
6078
6079 const uint32_t simulationMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), true, m_subgroupSize,
6080 fragmentStride, primitiveStride, ref, log, primitiveMap);
6081 log << tcu::TestLog::Message << "simulated maxLoc: " << simulationMaxLoc << tcu::TestLog::EndMessage;
6082 // maxLoc is per-invocation. Add one (to make sure no additional writes are done)
6083 uint32_t maxLoc = simulationMaxLoc;
6084 maxLoc += 1;
6085 maxLoc *= (hostSubgroupStride * 128u * primitiveStride);
6086
6087 constexpr uint32_t bufferCount = 4;
6088 enum Bindings
6089 {
6090 InputA,
6091 OutputBallots,
6092 OutputCounts,
6093 OutputPriMap
6094 };
6095
6096 de::MovePtr<BufferWithMemory> buffers[bufferCount];
6097 vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
6098
6099 VkDeviceSize sizes[bufferCount]{
6100 // InputA { uint a[]; } inputA; filled with a[i] := i
6101 (FragmentRandomProgram::conditionIfInvocationStride + 2) * sizeof(uint32_t),
6102
6103 // OutputB { uvec4 b[]; } outputB;
6104 maxLoc * sizeof(tcu::UVec4),
6105
6106 // OutputC { uint loc[]; } outputC;
6107 (hostSubgroupStride * 128u * primitiveStride) * sizeof(uint32_t),
6108
6109         // OutputP { uint p[]; } outputP; a few more entries for calculating subgroupID, subgroupSize, non-helper and helper invocations
6110 (fragmentStride * primitiveStride + 16u) * sizeof(uint32_t)};
6111
6112 VkBufferUsageFlags usages[bufferCount]{
6113 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6114 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6115 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6116 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6117 };
6118
6119 // allocate buffers
6120 for (uint32_t i = 0; i < bufferCount; ++i)
6121 {
6122 if (sizes[i] > limits.maxStorageBufferRange)
6123 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6124
6125 try
6126 {
6127 buffers[i] = de::MovePtr<BufferWithMemory>(
6128 new BufferWithMemory(vk, device, allocator,
6129 makeBufferCreateInfo(sizes[i], usages[i] | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
6130 VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
6131 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6132 }
6133 catch (tcu::ResourceError &)
6134 {
6135 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6136 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6137 "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
6138 }
6139 bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
6140 }
6141
6142 // get raw pointers to previously allocated buffers
6143 void *ptrs[bufferCount];
6144 for (uint32_t i = 0; i < bufferCount; ++i)
6145 {
6146 ptrs[i] = buffers[i]->getAllocation().getHostPtr();
6147 }
6148
6149     // populate buffers with their initial content
6150 {
6151 auto rangeBufferA =
6152 makeStdBeginEnd<uint32_t>(ptrs[InputA], static_cast<uint32_t>(sizes[InputA] / sizeof(uint32_t)));
6153 std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
6154 }
6155 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6156 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6157 deMemset(ptrs[OutputPriMap], 0, (size_t)sizes[OutputPriMap]);
6158
6159 // (...) and flush them to the GPU
6160 for (uint32_t i = 0; i < bufferCount; ++i)
6161 {
6162 flushAlloc(vk, device, buffers[i]->getAllocation());
6163 }
6164
6165 VkDescriptorType descTypes[bufferCount]{
6166 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6167 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6168 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6169 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6170 };
6171
6172 vk::DescriptorSetLayoutBuilder layoutBuilder;
6173 for (uint32_t i = 0; i < bufferCount; ++i)
6174 {
6175 layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
6176 }
6177 vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
6178
6179 vk::DescriptorPoolBuilder poolBuilder;
6180 for (uint32_t i = 0; i < bufferCount; ++i)
6181 {
6182 poolBuilder.addType(descTypes[i], 1);
6183 }
6184 vk::Unique<vk::VkDescriptorPool> descriptorPool(
6185 poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
6186 vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
6187
6188 vk::DescriptorSetUpdateBuilder setUpdateBuilder;
6189 for (uint32_t i = 0; i < bufferCount; ++i)
6190 {
6191 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
6192 &bufferDescriptors[i]);
6193 }
6194 setUpdateBuilder.update(vk, device);
6195
6196 const VkPushConstantRange pushConstantRange{
6197 (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
6198 0u, // uint32_t offset;
6199 sizeof(PushConstant) // uint32_t size;
6200 };
6201
6202 const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
6203 VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
6204 DE_NULL, // pNext
6205 (VkPipelineLayoutCreateFlags)0, // flags
6206 1u, // setLayoutCount
6207 &descriptorSetLayout.get(), // pSetLayouts
6208 1u, // pushConstantRangeCount
6209 &pushConstantRange, // pPushConstantRanges
6210 };
6211
6212 const VkFormat format = VK_FORMAT_R8G8B8A8_UNORM;
6213 const VkImageCreateInfo imageCreateInfo = makeImageCreateInfo(format);
6214 const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
6215 de::MovePtr<ImageWithMemory> image(
6216 new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
6217 Move<VkImageView> view = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
6218 Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
6219 Move<VkFramebuffer> framebuffer =
6220 makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
6221 const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
6222 const Shaders shaders = createShaders();
6223 Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
6224 Move<VkPipeline> pipeline =
6225 createGraphicsPipeline(*pipelineLayout, *renderPass, m_data.sizeX, m_data.sizeY, shaders, topology, 0U);
6226 Move<VkCommandPool> cmdPool =
6227 createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
6228 Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6229
6230 PushConstant pc{};
6231 pc.width = m_data.sizeX;
6232 pc.height = m_data.sizeY;
6233 pc.primitiveStride = primitiveStride;
6234 pc.invocationStride = 0u;
6235 pc.subgroupStride = hostSubgroupStride;
6236 pc.enableInvocationIndex = VK_FALSE;
6237
6238 auto callRecordDrawingAndSubmit = std::bind(
6239 &ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this, *cmdBuffer, *pipelineLayout, *pipeline,
6240 *descriptorSet, std::cref(pc), std::cref(renderBeginInfo), **vertexBuffer, (primitiveStride * 3u), **image);
6241
6242 // compute "maxLoc", which is a potential maximum number of locations written
6243 callRecordDrawingAndSubmit();
6244
6245 // Take the maximum of "maxLoc" over all invocations.
6246 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6247 auto rangeLoc = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], (hostSubgroupStride * 128u * primitiveStride));
6248 const uint32_t computedShaderMaxLoc = *max_element(rangeLoc.first, rangeLoc.second);
6249 log << tcu::TestLog::Message << "Computed maxLoc in the shader: " << computedShaderMaxLoc
6250 << tcu::TestLog::EndMessage;
6251
6252 if (computedShaderMaxLoc >= FragmentRandomProgram::experimentalOutLocSize)
6253 {
6254 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6255 "Calculated maxLoc from a shader (which is " + de::toString(computedShaderMaxLoc) +
6256 ") "
6257 "exceeds BALLOT_STACK_SIZE (which is " +
6258 de::toString(FragmentRandomProgram::experimentalOutLocSize) +
6259 ").\n"
6260                                "To fix this, slightly increase " MAKETEXT(
6261                                    FragmentRandomProgram::experimentalOutLocSize) " "
6262                                                                                   "at line " +
6263                                    de::toString(BALLOT_STACK_SIZE_DEFVAL_LINE));
6264 }
6265
6266 // If we need more space, reallocate OutputB::b[]
6267 if (computedShaderMaxLoc != simulationMaxLoc)
6268 {
6269 // Add one (to make sure no additional writes are done) and multiply by
6270 // the number of invocations and current primitive count
6271 maxLoc = (std::max(computedShaderMaxLoc, simulationMaxLoc) + 1) * (hostSubgroupStride * 128u * primitiveStride);
6272 sizes[OutputBallots] = maxLoc * sizeof(tcu::UVec4);
6273
6274 if (sizes[OutputBallots] > limits.maxStorageBufferRange)
6275 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6276
6277 try
6278 {
6279 buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
6280 vk, device, allocator,
6281 makeBufferCreateInfo(sizes[OutputBallots], usages[OutputBallots] | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
6282 VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
6283 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6284 }
6285 catch (tcu::ResourceError &)
6286 {
6287 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6288 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6289 "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
6290 }
6291 bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
6292 ptrs[OutputBallots] = buffers[OutputBallots]->getAllocation().getHostPtr();
6293
6294 vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
6295 setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
6296 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
6297 setUpdateBuilder2.update(vk, device);
6298 }
6299
6300 // Clear any writes to ballots/stores OutputB::b[] aka buffer[OutputBallots] during the counting pass
6301     // Note that its size may have changed since the first memory allocation
6302 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6303 // Clear any writes to counting OutputC::loc[] aka buffer[OutputCounts] during the counting pass
6304 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6305 // Clear any writes to counting OutputP::p[] aka buffer[OutputPriMap] during the counting pass
6306 deMemset(ptrs[OutputPriMap], 0, (size_t)sizes[OutputPriMap]);
6307
6308 // flush them all to the GPU
6309 flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
6310 flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6311 flushAlloc(vk, device, buffers[OutputPriMap]->getAllocation());
6312
6313 // run the actual shader with updated PushConstant
6314 pc.enableInvocationIndex = VK_TRUE;
6315 callRecordDrawingAndSubmit();
6316
6317 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6318 invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
6319 invalidateAlloc(vk, device, buffers[OutputPriMap]->getAllocation());
6320
6321 // Simulate execution on the CPU, and compare against the GPU result
6322 try
6323 {
6324 ref.resize(maxLoc, tcu::UVec4());
6325 }
6326 catch (const std::bad_alloc &)
6327 {
6328 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6329 return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
6330                                "Failed system memory allocation " + de::toString(maxLoc * sizeof(tcu::UVec4)) + " bytes");
6331 }
6332
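    // Replace the host-generated primitive map with the one the GPU actually wrote, then rebuild the
    // Arrangement from it so that the CPU replay below uses the same fragment-to-subgroup assignment
    // as the driver.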
6333 std::fill(primitiveMap.begin(), primitiveMap.end(), 0);
6334 auto primitiveMapRange = makeStdBeginEnd<const uint32_t>(ptrs[OutputPriMap], (fragmentStride * primitiveStride));
6335 std::copy(primitiveMapRange.first, primitiveMapRange.second, primitiveMap.begin());
6336
6337 const FragmentRandomProgram::Arrangement a(primitiveMap, m_data.sizeX, m_data.sizeY, m_subgroupSize,
6338 primitiveStride);
6339 const tcu::UVec4 *ballots = static_cast<tcu::UVec4 *>(ptrs[OutputBallots]);
6340
6341 program->execute(m_context.getTestContext().getWatchDog(), false, m_subgroupSize, fragmentStride, primitiveStride,
6342 ref, log, primitiveMap, ballots);
6343
6344 const uint32_t finalMaxLoc = std::max(computedShaderMaxLoc, simulationMaxLoc);
6345 const qpTestResult res = calculateAndLogResultEx(log, ballots, ref, finalMaxLoc, a, PrintMode::None);
6346
6347 return tcu::TestStatus(res, qpGetTestResultName(res));
6348 }
6349
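// The vertex variant draws a point list; the vertex positions come from
// VertexRandomProgram::Arrangement::generatePrimitives() over a cellsHorz x cellsVert grid, with the
// number of points governed by VertexRandomProgram::fillPercentage (judging by the parameter name).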
6350 de::MovePtr<BufferWithMemory> ReconvergenceTestVertexInstance::createVertexBufferAndFlush(uint32_t cellsHorz,
6351 uint32_t cellsVert,
6352 VkPrimitiveTopology topology)
6353 {
6354 DE_UNREF(topology);
6355 DE_ASSERT(VK_PRIMITIVE_TOPOLOGY_POINT_LIST == topology);
6356 const std::vector<tcu::Vec4> vertices =
6357 VertexRandomProgram::Arrangement::generatePrimitives(cellsHorz, cellsVert, VertexRandomProgram::fillPercentage);
6358 return ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(vertices);
6359 }
6360
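// Shader modules for the vertex variant: the stage under test is the "test" vertex shader,
// combined with the "frag" fragment shader to complete the pipeline.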
6361 std::vector<Move<VkShaderModule>> ReconvergenceTestVertexInstance::createShaders(void)
6362 {
6363 const DeviceInterface &vk = m_context.getDeviceInterface();
6364 const VkDevice device = m_context.getDevice();
6365
6366 Move<VkShaderModule> vertex = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"), 0);
6367 Move<VkShaderModule> fragment = createShaderModule(vk, device, m_context.getBinaryCollection().get("frag"), 0);
6368
6369 // { #vert, #frag, #tesc, tese, geom }; if any
6370 std::vector<Move<VkShaderModule>> shaders;
6371 shaders.emplace_back(vertex);
6372 shaders.emplace_back(fragment);
6373
6374 return shaders;
6375 }
6376
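// ReconvergenceTestVertexInstance::iterate follows the same two-pass scheme as the fragment variant:
// a counting draw to measure maxLoc, an optional reallocation of the ballot buffer, a second draw with
// enableInvocationIndex = VK_TRUE, and a CPU replay compared via calculateAndLogResultEx().
// The invocation count equals the number of generated point primitives, and the host-simulated
// outputP contents are later overwritten with the values written by the GPU.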
6377 tcu::TestStatus ReconvergenceTestVertexInstance::iterate(void)
6378 {
6379 const VkPhysicalDeviceLimits &limits = m_context.getDeviceProperties().limits;
6380 if (sizeof(PushConstant) > limits.maxPushConstantsSize)
6381 {
6382 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6383 "PushConstant size " + std::to_string(sizeof(PushConstant)) + " exceeds device limit " +
6384 std::to_string(limits.maxPushConstantsSize));
6385 }
6386
6387 const DeviceInterface &vk = m_context.getDeviceInterface();
6388 const VkDevice device = m_context.getDevice();
6389 Allocator &allocator = m_context.getDefaultAllocator();
6390 const uint32_t queueIndex = m_context.getUniversalQueueFamilyIndex();
6391 add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
6392 const VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
6393 const uint32_t fragmentStride = uint32_t(m_data.sizeX * m_data.sizeY);
6394 const uint32_t invocationStride =
6395 static_cast<uint32_t>(VertexRandomProgram::Arrangement::generatePrimitives(m_data.sizeX, m_data.sizeY,
6396 VertexRandomProgram::fillPercentage)
6397 .size());
6398
6399 de::MovePtr<VertexRandomProgram> program(new VertexRandomProgram(m_data));
6400 program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
6401
6402 // simulate content of outputP buffer
6403 std::vector<uint32_t> outputP =
6404 VertexRandomProgram::Arrangement::generateOutputPvector(m_subgroupSize, invocationStride);
6405
6406 std::vector<tcu::UVec4> ref;
6407 const uint32_t hostMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), true, m_subgroupSize,
6408 fragmentStride, invocationStride, ref, log, outputP, nullptr);
6409 log << tcu::TestLog::Message << "Rendering area : " << tcu::UVec2(m_data.sizeX, m_data.sizeY)
6410 << tcu::TestLog::EndMessage;
6411 log << tcu::TestLog::Message << "invocationStride: " << invocationStride << tcu::TestLog::EndMessage;
6412 log << tcu::TestLog::Message << "Simulated maxLoc: " << hostMaxLoc << tcu::TestLog::EndMessage;
6413 // maxLoc is per-invocation. Add one (to make sure no additional writes are done).
6414 uint32_t maxLoc = hostMaxLoc;
6415 maxLoc += 1;
6416 maxLoc *= invocationStride;
6417
6418 constexpr uint32_t bufferCount = 4u;
6419 enum Bindings
6420 {
6421 InputA,
6422 OutputBallots,
6423 OutputCounts,
6424 OutputPrimitives
6425 };
6426
6427 de::MovePtr<BufferWithMemory> buffers[bufferCount];
6428 vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
6429
6430 uint32_t counts[bufferCount]{// InputA { uint a[]; } inputA;
6431 uint32_t(m_data.sizeX * m_data.sizeY),
6432                                  // OutputB { uvec4 b[]; } outputB;
6433 maxLoc,
6434 // OutputC { uint loc[]; } outputC;
6435 invocationStride,
6436 // OutputP { uint p[]; } outputP;
6437 uint32_t(outputP.size())};
6438
6439 VkDeviceSize sizes[bufferCount]{// InputA { uint a[]; } inputA;
6440 counts[InputA] * sizeof(uint32_t),
6441                                     // OutputB { uvec4 b[]; } outputB;
6442 counts[OutputBallots] * sizeof(tcu::UVec4),
6443 // OutputC { uint loc[]; } outputC;
6444 counts[OutputCounts] * sizeof(uint32_t),
6445 // OutputP { uint p[]; } outputP;
6446 counts[OutputPrimitives] * sizeof(uint32_t)};
6447
6448 const VkBufferUsageFlags cmnUsages = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
6449 VkBufferUsageFlags usages[bufferCount]{
6450 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6451 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6452 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6453 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6454 };
6455
6456 // allocate buffers
6457 for (uint32_t i = 0; i < bufferCount; ++i)
6458 {
6459 if (sizes[i] > limits.maxStorageBufferRange)
6460 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6461
6462 try
6463 {
6464 buffers[i] = de::MovePtr<BufferWithMemory>(
6465 new BufferWithMemory(vk, device, allocator, makeBufferCreateInfo(sizes[i], usages[i] | cmnUsages),
6466 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6467 }
6468 catch (tcu::ResourceError &)
6469 {
6470 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6471 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6472 "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
6473 }
6474 bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
6475 }
6476
6477 // get raw pointers to previously allocated buffers
6478 void *ptrs[bufferCount];
6479 for (uint32_t i = 0; i < bufferCount; ++i)
6480 {
6481 ptrs[i] = buffers[i]->getAllocation().getHostPtr();
6482 }
6483
6484     // populate buffers with their initial content
6485 {
6486 auto rangeBufferA = makeStdBeginEnd<uint32_t>(ptrs[InputA], counts[InputA]);
6487 std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
6488 }
6489 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6490 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6491 deMemset(ptrs[OutputPrimitives], 0, (size_t)sizes[OutputPrimitives]);
6492
6493 // (...) and flush them to the GPU
6494 for (uint32_t i = 0; i < bufferCount; ++i)
6495 {
6496 flushAlloc(vk, device, buffers[i]->getAllocation());
6497 }
6498
6499 VkDescriptorType descTypes[bufferCount]{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6500 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER};
6501
6502 vk::DescriptorSetLayoutBuilder layoutBuilder;
6503 for (uint32_t i = 0; i < bufferCount; ++i)
6504 {
6505 layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
6506 }
6507 vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
6508
6509 vk::DescriptorPoolBuilder poolBuilder;
6510 for (uint32_t i = 0; i < bufferCount; ++i)
6511 {
6512 poolBuilder.addType(descTypes[i], 1);
6513 }
6514 vk::Unique<vk::VkDescriptorPool> descriptorPool(
6515 poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
6516 vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
6517
6518 vk::DescriptorSetUpdateBuilder setUpdateBuilder;
6519 for (uint32_t i = 0; i < bufferCount; ++i)
6520 {
6521 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
6522 &bufferDescriptors[i]);
6523 }
6524 setUpdateBuilder.update(vk, device);
6525
6526 const VkPushConstantRange pushConstantRange{
6527 (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
6528 0u, // uint32_t offset;
6529 sizeof(PushConstant) // uint32_t size;
6530 };
6531
6532 const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
6533 VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
6534 DE_NULL, // pNext
6535 (VkPipelineLayoutCreateFlags)0, // flags
6536 1u, // setLayoutCount
6537 &descriptorSetLayout.get(), // pSetLayouts
6538 1u, // pushConstantRangeCount
6539 &pushConstantRange, // pPushConstantRanges
6540 };
6541
6542 const uint32_t imageWidth = m_data.sizeX;
6543 const uint32_t imageHeight = m_data.sizeY;
6544 const VkFormat format = VK_FORMAT_R8G8B8A8_UNORM;
6545 const VkImageCreateInfo imageCreateInfo{
6546 VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
6547 nullptr, // const void* pNext;
6548 VkImageCreateFlags(0), // VkImageCreateFlags flags;
6549 VK_IMAGE_TYPE_2D, // VkImageType imageType;
6550 format, // VkFormat format;
6551 {imageWidth, imageHeight, 1u}, // VkExtent3D extent;
6552 1u, // uint32_t mipLevels;
6553 1u, // uint32_t arrayLayers;
6554 VK_SAMPLE_COUNT_1_BIT, // VkSampleCountFlagBits samples;
6555 VK_IMAGE_TILING_OPTIMAL, // VkImageTiling tiling;
6556 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
6557 VK_SHARING_MODE_EXCLUSIVE, // VkSharingMode sharingMode;
6558 0u, // uint32_t queueFamilyIndexCount;
6559 0u, // const uint32_t* pQueueFamilyIndices;
6560 VK_IMAGE_LAYOUT_UNDEFINED // VkImageLayout initialLayout;
6561 };
6562 const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
6563 de::MovePtr<ImageWithMemory> image(
6564 new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
6565 Move<VkImageView> view = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
6566 Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
6567 Move<VkFramebuffer> framebuffer =
6568 makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
6569 de::MovePtr<BufferWithMemory> vertexBuffer = createVertexBufferAndFlush(m_data.sizeX, m_data.sizeY, topology);
6570 const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
6571 const Shaders shaders = createShaders();
6572 Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
6573 Move<VkPipeline> pipeline =
6574 createGraphicsPipeline(*pipelineLayout, *renderPass, imageWidth, imageHeight, shaders, topology, 0u);
6575 Move<VkCommandPool> cmdPool =
6576 createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
6577 Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6578
6579 PushConstant pc{};
6580 pc.invocationStride = invocationStride;
6581 pc.width = m_data.sizeX;
6582 pc.height = m_data.sizeY;
6583 pc.enableInvocationIndex = VK_FALSE;
6584
6585 auto callRecordDrawingAndSubmit = std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this,
6586 *cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, std::cref(pc),
6587 std::cref(renderBeginInfo), **vertexBuffer, invocationStride, **image);
6588
6589 // compute "maxLoc", which is a potential maximum number of locations written
6590 callRecordDrawingAndSubmit();
6591
6592 // Take the maximum of "maxLoc" over all invocations.
6593 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6594 auto rangeLoc = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], counts[OutputCounts]);
6595 const uint32_t shaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
6596 log << tcu::TestLog::Message << "Computed maxLoc in shader: " << shaderMaxLoc << tcu::TestLog::EndMessage;
6597
6598 // If we need more space, reallocate OutputB::b[] aka buffers[1]
6599 if (shaderMaxLoc != hostMaxLoc)
6600 {
6601 // Add one (to make sure no additional writes are done) and multiply by
6602 // the number of invocations and current primitive count
6603 maxLoc = (std::max(shaderMaxLoc, hostMaxLoc) + 1u) * invocationStride;
6604 counts[OutputBallots] = maxLoc;
6605 sizes[OutputBallots] = counts[OutputBallots] * sizeof(tcu::UVec4);
6606
6607 if (sizes[OutputBallots] > limits.maxStorageBufferRange)
6608 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6609
6610 try
6611 {
6612 buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
6613 vk, device, allocator, makeBufferCreateInfo(sizes[OutputBallots], usages[OutputBallots] | cmnUsages),
6614 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6615 }
6616 catch (tcu::ResourceError &)
6617 {
6618 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6619 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6620 "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
6621 }
6622 bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
6623 ptrs[OutputBallots] = buffers[OutputBallots]->getAllocation().getHostPtr();
6624
6625 vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
6626 setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
6627 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
6628 setUpdateBuilder2.update(vk, device);
6629 }
6630
6631 // Clear any writes to ballots/stores OutputB::b[] aka buffer[1] during the counting pass
6632     // Note that its size may have changed since the first memory allocation
6633 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6634 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6635 deMemset(ptrs[OutputPrimitives], 0, (size_t)sizes[OutputPrimitives]);
6636
6637 // flush them all to the GPU
6638 flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
6639 flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6640 flushAlloc(vk, device, buffers[OutputPrimitives]->getAllocation());
6641
6642 // run the actual shader with updated PushConstant
6643 pc.enableInvocationIndex = VK_TRUE;
6644 callRecordDrawingAndSubmit();
6645
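    // Read back the per-invocation location counts from the second draw; if they no longer match the
    // counting pass, the ballot buffer sizing cannot be trusted, so the result is only a quality warning.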
6646 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6647 const uint32_t finalShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
6648 log << tcu::TestLog::Message << "Final maxLoc from shader: " << finalShaderMaxLoc << tcu::TestLog::EndMessage;
6649 if (finalShaderMaxLoc != shaderMaxLoc)
6650 {
6651 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6652 "maxLoc differs across shader invocations, expected: " + de::toString(shaderMaxLoc) +
6653 " got: " + de::toString(finalShaderMaxLoc));
6654 }
6655
6656 invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
6657 const tcu::UVec4 *ballots = static_cast<tcu::UVec4 *>(ptrs[OutputBallots]);
6658
6659 invalidateAlloc(vk, device, buffers[OutputPrimitives]->getAllocation());
6660 auto outputPrange = makeStdBeginEnd<uint32_t>(ptrs[OutputPrimitives], counts[OutputPrimitives]);
6661 std::copy(outputPrange.first, outputPrange.second, outputP.begin());
6662
6663 try
6664 {
6665 ref.resize(counts[OutputBallots], tcu::UVec4(0u, 0u, 0u, 0u));
6666 }
6667 catch (const std::bad_alloc &)
6668 {
6669 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6670 return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
6671 "Failed system memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
6672 }
6673
6674 // Simulate execution on the CPU, and compare against the GPU result
6675 const uint32_t finalHostMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), false, m_subgroupSize,
6676 fragmentStride, invocationStride, ref, log, outputP, ballots);
6677
6678 const qpTestResult res = calculateAndLogResultEx(log, ballots, ref, finalHostMaxLoc, PrintMode::None);
6679
6680 return tcu::TestStatus(res, qpGetTestResultName(res));
6681 }
6682
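// Compares the GPU-written ballots against the CPU reference element by element. Any difference fails
// the test; only the first printMismatchCount mismatches are written to the log (and, optionally, to
// the console).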
6683 qpTestResult_e ReconvergenceTestVertexInstance::calculateAndLogResultEx(add_ref<tcu::TestLog> log,
6684 const tcu::UVec4 *result,
6685 const std::vector<tcu::UVec4> &ref,
6686 const uint32_t maxLoc,
6687 const PrintMode printMode)
6688 {
6689 DE_UNREF(maxLoc);
6690 DE_UNREF(printMode);
6691
6692 qpTestResult res = QP_TEST_RESULT_PASS;
6693 uint32_t mismatchCount = 0u;
6694 const uint32_t printMismatchCount = 5u;
6695
6696 // With maximal reconvergence, we should expect the output to exactly match the reference.
6697 const uint32_t ballotStoreCount = static_cast<uint32_t>(ref.size());
6698 for (uint32_t i = 0; i < ballotStoreCount; ++i)
6699 {
6700 const Ballot resultVal(result[i], m_subgroupSize);
6701 const Ballot refVal(ref.at(i), m_subgroupSize);
6702 if (resultVal != refVal)
6703 {
6704 if (mismatchCount++ < printMismatchCount)
6705 {
6706 res = QP_TEST_RESULT_FAIL;
6707                 log << tcu::TestLog::Message << "Mismatch at " << i << "\nexpected: " << refVal
6708                     << "\n     got: " << resultVal << tcu::TestLog::EndMessage;
6709 if (printMode == PrintMode::Console)
6710 {
6711                     std::cout << "Mismatch at " << i << "\nexpected: " << refVal << "\n     got: " << resultVal
6712 << std::endl;
6713 }
6714 }
6715 }
6716 }
6717
6718 log << tcu::TestLog::Message << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount
6719 << tcu::TestLog::EndMessage;
6720 if (printMode == PrintMode::Console)
6721 {
6722 std::cout << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount << std::endl;
6723 }
6724
6725 return res;
6726 }
6727
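// Shader modules for the tessellation-control variant: the stage under test is the "test" TCS,
// accompanied by the "vert", "tese" and "frag" helper modules.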
6728 std::vector<Move<VkShaderModule>> ReconvergenceTestTessCtrlInstance::createShaders(void)
6729 {
6730 const DeviceInterface &vk = m_context.getDeviceInterface();
6731 const VkDevice device = m_context.getDevice();
6732
6733 Move<VkShaderModule> vertex = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"));
6734 Move<VkShaderModule> fragment = createShaderModule(vk, device, m_context.getBinaryCollection().get("frag"));
6735 Move<VkShaderModule> control = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"));
6736 Move<VkShaderModule> evaluation = createShaderModule(vk, device, m_context.getBinaryCollection().get("tese"));
6737
6738 // { #vert, #frag, #tesc, #tese, geom }; if any
6739 std::vector<Move<VkShaderModule>> shaders;
6740 shaders.emplace_back(vertex);
6741 shaders.emplace_back(fragment);
6742 shaders.emplace_back(control);
6743 shaders.emplace_back(evaluation);
6744
6745 return shaders;
6746 }
6747
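// ReconvergenceTestTessCtrlInstance::iterate: patches with a single control point are drawn so that
// each patch spawns TessCtrlRandomProgram::minSubgroupSize TCS invocations; the usual sequence of
// counting pass, optional ballot-buffer reallocation, real pass and CPU simulation follows, with
// 64-bit (uvec2) ballots.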
6748 tcu::TestStatus ReconvergenceTestTessCtrlInstance::iterate(void)
6749 {
6750 const DeviceInterface &vk = m_context.getDeviceInterface();
6751 const VkDevice device = m_context.getDevice();
6752 Allocator &allocator = m_context.getDefaultAllocator();
6753 const uint32_t queueIndex = m_context.getUniversalQueueFamilyIndex();
6754 add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
6755
6756 if (m_subgroupSize < TessCtrlRandomProgram::minSubgroupSize || m_subgroupSize > 64)
6757 {
6758 std::stringstream str;
6759 str << "Subgroup size less than " << TessCtrlRandomProgram::minSubgroupSize
6760 << " or greater than 64 not handled.";
6761 str.flush();
6762 TCU_THROW(TestError, str.str());
6763 }
6764
6765 deRandom rnd;
6766 deRandom_init(&rnd, m_data.seed);
6767
6768 vk::VkPhysicalDeviceProperties2 properties2;
6769 deMemset(&properties2, 0, sizeof(properties2));
6770 properties2.sType = vk::VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
6771 m_context.getInstanceInterface().getPhysicalDeviceProperties2(m_context.getPhysicalDevice(), &properties2);
6772 const VkPhysicalDeviceLimits &limits = properties2.properties.limits;
6773
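    // One control point per patch: (m_subgroupSize / minSubgroupSize) patches fill one subgroup and
    // m_data.sizeX subgroups are used in total, so every patch contributes minSubgroupSize invocations
    // and invocationStride ends up as vertexCount * minSubgroupSize.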
6774 const uint32_t patchControlPoints = 1;
6775 const uint32_t vertexCount =
6776 (m_subgroupSize / TessCtrlRandomProgram::minSubgroupSize) * patchControlPoints * m_data.sizeX;
6777 const uint32_t primitiveStride = vertexCount / patchControlPoints;
6778 de::MovePtr<BufferWithMemory> vertexBuffer =
6779 createVertexBufferAndFlush(vertexCount, 1u, VK_PRIMITIVE_TOPOLOGY_PATCH_LIST);
6780 const uint32_t invocationStride = vertexCount * TessCtrlRandomProgram::minSubgroupSize;
6781 DE_ASSERT(invocationStride < MAX_INVOCATIONS_ALL_TESTS);
6782
6783 log << tcu::TestLog::Message << "LayoutVertexOut: " << (uint32_t)TessCtrlRandomProgram::minSubgroupSize
6784 << tcu::TestLog::EndMessage;
6785 log << tcu::TestLog::Message << "patchControlPoints: " << patchControlPoints << tcu::TestLog::EndMessage;
6786 log << tcu::TestLog::Message << "primitiveStride: " << primitiveStride << tcu::TestLog::EndMessage;
6787 log << tcu::TestLog::Message << "invocationStride: " << invocationStride << tcu::TestLog::EndMessage;
6788 log << tcu::TestLog::Message << "usedSubgroupCount: " << m_data.sizeX << tcu::TestLog::EndMessage;
6789
6790 de::MovePtr<TessCtrlRandomProgram> program(new TessCtrlRandomProgram(m_data, invocationStride));
6791 program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
6792
6793 std::vector<uint64_t> ref;
6794 const uint32_t simulationMaxLoc = program->simulate(true, m_subgroupSize, ref);
6795 log << tcu::TestLog::Message << "simulated maxLoc: " << simulationMaxLoc << tcu::TestLog::EndMessage;
6796 // maxLoc is per-invocation. Add one (to make sure no additional writes are done)
6797 uint32_t maxLoc = simulationMaxLoc;
6798 maxLoc += 1;
6799 maxLoc *= invocationStride;
6800
6801 constexpr uint32_t bufferCount = 3;
6802 enum Bindings
6803 {
6804 InputA,
6805 OutputBallots,
6806 OutputCounts,
6807 };
6808
6809 de::MovePtr<BufferWithMemory> buffers[bufferCount];
6810 vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
6811
6812 VkDeviceSize sizes[bufferCount]{
6813 // InputA { uint a[]; } inputA; filled with a[i] == i
6814 invocationStride * sizeof(uint32_t),
6815 // OutputB { uvec2 b[]; } outputB;
6816 maxLoc * sizeof(uint64_t),
6817 // OutputC { uint loc[]; } outputC;
6818 invocationStride * sizeof(uint32_t),
6819 };
6820
6821 VkBufferUsageFlags usages[bufferCount]{
6822 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6823 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6824 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
6825 };
6826
6827 // allocate buffers
6828 for (uint32_t i = 0; i < bufferCount; ++i)
6829 {
6830 if (sizes[i] > limits.maxStorageBufferRange)
6831 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6832
6833 try
6834 {
6835 buffers[i] = de::MovePtr<BufferWithMemory>(
6836 new BufferWithMemory(vk, device, allocator,
6837 makeBufferCreateInfo(sizes[i], usages[i] | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
6838 VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
6839 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6840 }
6841 catch (tcu::ResourceError &)
6842 {
6843 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6844 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6845 "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
6846 }
6847 bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
6848 }
6849
6850 // get raw pointers to previously allocated buffers
6851 void *ptrs[bufferCount];
6852 for (uint32_t i = 0; i < bufferCount; ++i)
6853 {
6854 ptrs[i] = (uint32_t *)buffers[i]->getAllocation().getHostPtr();
6855 }
6856
6857     // populate buffers with their initial content
6858 {
6859 auto rangeBufferA = makeStdBeginEnd<uint32_t>(ptrs[InputA], invocationStride);
6860 std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
6861 }
6862 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
6863 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
6864
6865 // (...) and flush them to the GPU
6866 for (uint32_t i = 0; i < bufferCount; ++i)
6867 {
6868 flushAlloc(vk, device, buffers[i]->getAllocation());
6869 }
6870
6871 VkDescriptorType descTypes[bufferCount]{
6872 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6873 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6874 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
6875 };
6876
6877 vk::DescriptorSetLayoutBuilder layoutBuilder;
6878 for (uint32_t i = 0; i < bufferCount; ++i)
6879 {
6880 layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
6881 }
6882 vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
6883
6884 vk::DescriptorPoolBuilder poolBuilder;
6885 for (uint32_t i = 0; i < bufferCount; ++i)
6886 {
6887 poolBuilder.addType(descTypes[i], 1);
6888 }
6889 vk::Unique<vk::VkDescriptorPool> descriptorPool(
6890 poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
6891 vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
6892
6893 vk::DescriptorSetUpdateBuilder setUpdateBuilder;
6894 for (uint32_t i = 0; i < bufferCount; ++i)
6895 {
6896 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
6897 &bufferDescriptors[i]);
6898 }
6899 setUpdateBuilder.update(vk, device);
6900
6901 const VkPushConstantRange pushConstantRange{
6902 (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
6903 0u, // uint32_t offset;
6904 sizeof(PushConstant) // uint32_t size;
6905 };
6906
6907     // TODO: verify that the PushConstant size fits within the limits of the running device
6908
6909 const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
6910 VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
6911 DE_NULL, // pNext
6912 (VkPipelineLayoutCreateFlags)0, // flags
6913 1u, // setLayoutCount
6914 &descriptorSetLayout.get(), // pSetLayouts
6915 1u, // pushConstantRangeCount
6916 &pushConstantRange, // pPushConstantRanges
6917 };
6918
6919 const uint32_t imageWidth = 256;
6920 const uint32_t imageHeight = 256;
6921 const VkFormat format = VK_FORMAT_R8G8B8A8_UNORM;
6922 const VkImageCreateInfo imageCreateInfo{
6923 VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
6924 nullptr, // const void* pNext;
6925 VkImageCreateFlags(0), // VkImageCreateFlags flags;
6926 VK_IMAGE_TYPE_2D, // VkImageType imageType;
6927 format, // VkFormat format;
6928 {imageWidth, imageHeight, 1u}, // VkExtent3D extent;
6929 1u, // uint32_t mipLevels;
6930 1u, // uint32_t arrayLayers;
6931 VK_SAMPLE_COUNT_1_BIT, // VkSampleCountFlagBits samples;
6932 VK_IMAGE_TILING_OPTIMAL, // VkImageTiling tiling;
6933 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
6934 VK_SHARING_MODE_EXCLUSIVE, // VkSharingMode sharingMode;
6935 0u, // uint32_t queueFamilyIndexCount;
6936 0u, // const uint32_t* pQueueFamilyIndices;
6937 VK_IMAGE_LAYOUT_UNDEFINED // VkImageLayout initialLayout;
6938 };
6939 const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
6940 de::MovePtr<ImageWithMemory> image(
6941 new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
6942 Move<VkImageView> view = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
6943 Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
6944 Move<VkFramebuffer> framebuffer =
6945 makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
6946 const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
6947 const Shaders shaders = createShaders();
6948 Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
6949 Move<VkPipeline> pipeline = createGraphicsPipeline(*pipelineLayout, *renderPass, imageWidth, imageHeight, shaders,
6950 VK_PRIMITIVE_TOPOLOGY_PATCH_LIST, patchControlPoints);
6951 Move<VkCommandPool> cmdPool =
6952 createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
6953 Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
6954
6955 PushConstant pc{};
6956 pc.invocationStride = 0u;
6957 pc.width = TessCtrlRandomProgram::minSubgroupSize;
6958 pc.height = patchControlPoints;
6959 pc.primitiveStride = primitiveStride;
6960
6961 auto callRecordDrawingAndSubmit = std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this,
6962 *cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, std::cref(pc),
6963 std::cref(renderBeginInfo), **vertexBuffer, vertexCount, **image);
6964
6965 // compute "maxLoc", which is a potential maximum number of locations written
6966 callRecordDrawingAndSubmit();
6967
6968 // Take the maximum of "maxLoc" over all invocations.
6969 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
6970 auto rangeLoc = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], invocationStride);
6971 const uint32_t computedShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
6972 log << tcu::TestLog::Message << "computed shaderMaxLoc: " << computedShaderMaxLoc << tcu::TestLog::EndMessage;
6973
6974 // If we need more space, reallocate OutputB::b[] aka buffers[1]
6975 if (computedShaderMaxLoc > simulationMaxLoc)
6976 {
6977 // Add one (to make sure no additional writes are done) and multiply by
6978 // the number of invocations and current primitive count
6979 maxLoc = (computedShaderMaxLoc + 1) * invocationStride;
6980 sizes[OutputBallots] = maxLoc * sizeof(uint64_t);
6981
6982 if (sizes[OutputBallots] > limits.maxStorageBufferRange)
6983 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
6984
6985 try
6986 {
6987 buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
6988 vk, device, allocator,
6989                 makeBufferCreateInfo(sizes[OutputBallots], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
6990 VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
6991 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
6992 }
6993 catch (tcu::ResourceError &)
6994 {
6995 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
6996 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
6997 "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
6998 }
6999 bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
7000 ptrs[OutputBallots] = buffers[OutputBallots]->getAllocation().getHostPtr();
7001
7002 vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
7003 setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
7004 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
7005 setUpdateBuilder2.update(vk, device);
7006 }
7007
7008 // Clear any writes to ballots/stores OutputB::b[] aka buffer[1] during the counting pass
7009     // Note that its size may have changed since the first memory allocation
7010 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7011 // Clear any writes to counting OutputC::loc[] aka buffer[2] during the counting pass
7012 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7013
7014 // flush them all to the GPU
7015 flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7016 flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7017
7018 // run the actual shader with updated PushConstant
7019 pc.invocationStride = invocationStride;
7020 pc.width = TessCtrlRandomProgram::minSubgroupSize;
7021 pc.height = patchControlPoints;
7022 pc.primitiveStride = primitiveStride;
7023 callRecordDrawingAndSubmit();
7024
7025 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7026 const uint32_t finalShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7027 log << tcu::TestLog::Message << "final shaderMaxLoc: " << finalShaderMaxLoc << tcu::TestLog::EndMessage;
7028 if (finalShaderMaxLoc > computedShaderMaxLoc)
7029 {
7030 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING, "maxLoc differs across shader invocations");
7031 }
7032
7033 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7034 invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7035
7036 // Simulate execution on the CPU, and compare against the GPU result
7037 try
7038 {
7039 ref.resize(maxLoc, 0ull);
7040 }
7041 catch (const std::bad_alloc &)
7042 {
7043 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7044 return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
7045 "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
7046 }
7047
7048 program->simulate(false, m_subgroupSize, ref);
7049
7050 const uint64_t *ballots = static_cast<uint64_t *>(ptrs[OutputBallots]);
7051 qpTestResult res = calculateAndLogResult(ballots, ref, invocationStride, m_subgroupSize, finalShaderMaxLoc,
7052 (invocationStride / 3), PrintMode::None);
7053
7054 return tcu::TestStatus(res, qpGetTestResultName(res));
7055 }
7056
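// Shader modules for the tessellation-evaluation variant: the stage under test is the "test" TES,
// accompanied by the "vert", "tesc" and "frag" helper modules.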
7057 std::vector<Move<VkShaderModule>> ReconvergenceTestTessEvalInstance::createShaders(void)
7058 {
7059 const DeviceInterface &vk = m_context.getDeviceInterface();
7060 const VkDevice device = m_context.getDevice();
7061
7062 Move<VkShaderModule> vertex = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"));
7063 Move<VkShaderModule> fragment = createShaderModule(vk, device, m_context.getBinaryCollection().get("frag"));
7064 Move<VkShaderModule> control = createShaderModule(vk, device, m_context.getBinaryCollection().get("tesc"));
7065 Move<VkShaderModule> evaluation = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"));
7066
7067 // { #vert, #frag, #tesc, #tese, geom }; if any
7068 std::vector<Move<VkShaderModule>> shaders;
7069 shaders.emplace_back(vertex);
7070 shaders.emplace_back(fragment);
7071 shaders.emplace_back(control);
7072 shaders.emplace_back(evaluation);
7073
7074 return shaders;
7075 }
7076
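// ReconvergenceTestTessEvalInstance::iterate mirrors the tessellation-control variant: a patch list
// sized from the subgroup size, a counting pass to measure maxLoc, an optional reallocation of the
// 64-bit ballot buffer, the real pass, and a CPU simulation for comparison.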
7077 tcu::TestStatus ReconvergenceTestTessEvalInstance::iterate(void)
7078 {
7079 const DeviceInterface &vk = m_context.getDeviceInterface();
7080 const VkDevice device = m_context.getDevice();
7081 Allocator &allocator = m_context.getDefaultAllocator();
7082 const uint32_t queueIndex = m_context.getUniversalQueueFamilyIndex();
7083 add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
7084
7085 if (m_subgroupSize < TessEvalRandomProgram::quadInvocationCount || m_subgroupSize > 64)
7086 {
7087 std::stringstream str;
7088 str << "Subgroup size less than " << TessEvalRandomProgram::quadInvocationCount
7089 << " or greater than 64 not handled.";
7090 str.flush();
7091 TCU_THROW(TestError, str.str());
7092 }
7093
7094 deRandom rnd;
7095 deRandom_init(&rnd, m_data.seed);
7096
7097 vk::VkPhysicalDeviceProperties2 properties2;
7098 deMemset(&properties2, 0, sizeof(properties2));
7099 properties2.sType = vk::VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
7100 m_context.getInstanceInterface().getPhysicalDeviceProperties2(m_context.getPhysicalDevice(), &properties2);
7101 const VkPhysicalDeviceLimits &limits = properties2.properties.limits;
7102
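    // Each patch yields TessEvalRandomProgram::quadInvocationCount evaluation invocations, so
    // patchesPerGroup patches fill exactly one subgroup and the total invocation count is
    // primitiveStride * quadInvocationCount.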
7103 const uint32_t patchesPerGroup = m_subgroupSize / TessEvalRandomProgram::quadInvocationCount;
7104 const uint32_t primitiveStride = patchesPerGroup * m_data.sizeX;
7105 const uint32_t invocationStride = primitiveStride * TessEvalRandomProgram::quadInvocationCount;
7106 const std::vector<tcu::Vec4> vertices = generateVertices(invocationStride, VK_PRIMITIVE_TOPOLOGY_POINT_LIST);
7107 const uint32_t vertexCount = uint32_t(vertices.size());
7108 de::MovePtr<BufferWithMemory> vertexBuffer = createVertexBufferAndFlush(vertices);
7109
7110 DE_ASSERT(invocationStride <= MAX_INVOCATIONS_ALL_TESTS);
7111
7112 de::MovePtr<TessEvalRandomProgram> program(new TessEvalRandomProgram(m_data, invocationStride));
7113 program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
7114
7115 std::vector<uint64_t> ref;
7116 const uint32_t simulationMaxLoc = program->simulate(true, m_subgroupSize, ref);
7117 log << tcu::TestLog::Message << "simulated maxLoc: " << simulationMaxLoc << tcu::TestLog::EndMessage;
7118 log << tcu::TestLog::Message << "effective patch size: " << m_data.sizeY << tcu::TestLog::EndMessage;
7119 log << tcu::TestLog::Message << "effective patch count: " << primitiveStride << tcu::TestLog::EndMessage;
7120 log << tcu::TestLog::Message << "total invocation count: " << invocationStride << tcu::TestLog::EndMessage;
7121
7122 // maxLoc is per-invocation. Add one (to make sure no additional writes are done).
7123 uint32_t maxLoc = simulationMaxLoc;
7124 maxLoc += 1;
7125 maxLoc *= invocationStride;
7126
7127 constexpr uint32_t bufferCount = 3;
7128 enum Bindings
7129 {
7130 InputA,
7131 OutputBallots,
7132 OutputCounts,
7133 };
7134
7135 de::MovePtr<BufferWithMemory> buffers[bufferCount];
7136 vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
7137
7138 VkDeviceSize sizes[bufferCount]{
7139 // InputA { uint a[]; } inputA; filled with a[i] == i
7140 invocationStride * sizeof(uint32_t),
7141 // OutputB { uvec2 b[]; } outputB;
7142 maxLoc * sizeof(uint64_t),
7143 // OutputC { uint loc[]; } outputC;
7144 invocationStride * sizeof(uint32_t),
7145 };
7146
7147 VkBufferUsageFlags usages[bufferCount]{
7148 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7149 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7150 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7151 };
7152
7153 // allocate buffers
7154 for (uint32_t i = 0; i < bufferCount; ++i)
7155 {
7156 if (sizes[i] > limits.maxStorageBufferRange)
7157 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
7158
7159 try
7160 {
7161 buffers[i] = de::MovePtr<BufferWithMemory>(
7162 new BufferWithMemory(vk, device, allocator,
7163 makeBufferCreateInfo(sizes[i], usages[i] | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
7164 VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
7165 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
7166 }
7167 catch (tcu::ResourceError &)
7168 {
7169 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7170 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7171 "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
7172 }
7173 bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
7174 }
7175
7176 // get raw pointers to previously allocated buffers
7177 void *ptrs[bufferCount];
7178 for (uint32_t i = 0; i < bufferCount; ++i)
7179 {
7180 ptrs[i] = (uint32_t *)buffers[i]->getAllocation().getHostPtr();
7181 }
7182
7183     // populate buffers with their initial content
7184 {
7185 auto rangeBufferA = makeStdBeginEnd<uint32_t>(ptrs[InputA], invocationStride);
7186 std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
7187 }
7188 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7189 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7190
7191 // (...) and flush them to the GPU
7192 for (uint32_t i = 0; i < bufferCount; ++i)
7193 {
7194 flushAlloc(vk, device, buffers[i]->getAllocation());
7195 }
7196
7197 VkDescriptorType descTypes[bufferCount]{
7198 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7199 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7200 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7201 };
7202
7203 vk::DescriptorSetLayoutBuilder layoutBuilder;
7204 for (uint32_t i = 0; i < bufferCount; ++i)
7205 {
7206 layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
7207 }
7208 vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
7209
7210 vk::DescriptorPoolBuilder poolBuilder;
7211 for (uint32_t i = 0; i < bufferCount; ++i)
7212 {
7213 poolBuilder.addType(descTypes[i], 1);
7214 }
7215 vk::Unique<vk::VkDescriptorPool> descriptorPool(
7216 poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
7217 vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
7218
7219 vk::DescriptorSetUpdateBuilder setUpdateBuilder;
7220 for (uint32_t i = 0; i < bufferCount; ++i)
7221 {
7222 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
7223 &bufferDescriptors[i]);
7224 }
7225 setUpdateBuilder.update(vk, device);
7226
7227 const VkPushConstantRange pushConstantRange{
7228 (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
7229 0u, // uint32_t offset;
7230 sizeof(PushConstant) // uint32_t size;
7231 };
7232
7233     // TODO: verify that the PushConstant size fits within the limits of the running device
7234
7235 const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
7236 VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
7237 DE_NULL, // pNext
7238 (VkPipelineLayoutCreateFlags)0, // flags
7239 1u, // setLayoutCount
7240 &descriptorSetLayout.get(), // pSetLayouts
7241 1u, // pushConstantRangeCount
7242 &pushConstantRange, // pPushConstantRanges
7243 };
7244
7245 const uint32_t imageWidth = 256;
7246 const uint32_t imageHeight = 256;
7247 const VkFormat format = VK_FORMAT_R8G8B8A8_UNORM;
7248 const VkImageCreateInfo imageCreateInfo{
7249 VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
7250 nullptr, // const void* pNext;
7251 VkImageCreateFlags(0), // VkImageCreateFlags flags;
7252 VK_IMAGE_TYPE_2D, // VkImageType imageType;
7253 format, // VkFormat format;
7254 {imageWidth, imageHeight, 1u}, // VkExtent3D extent;
7255 1u, // uint32_t mipLevels;
7256 1u, // uint32_t arrayLayers;
7257 VK_SAMPLE_COUNT_1_BIT, // VkSampleCountFlagBits samples;
7258 VK_IMAGE_TILING_OPTIMAL, // VkImageTiling tiling;
7259 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
7260 VK_SHARING_MODE_EXCLUSIVE, // VkSharingMode sharingMode;
7261 0u, // uint32_t queueFamilyIndexCount;
7262 0u, // const uint32_t* pQueueFamilyIndices;
7263 VK_IMAGE_LAYOUT_UNDEFINED // VkImageLayout initialLayout;
7264 };
7265 const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
7266 de::MovePtr<ImageWithMemory> image(
7267 new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
7268 Move<VkImageView> view = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
7269 Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
7270 Move<VkFramebuffer> framebuffer =
7271 makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
7272 const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
7273 const Shaders shaders = createShaders();
7274 Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
7275 Move<VkPipeline> pipeline =
7276 createGraphicsPipeline(*pipelineLayout, *renderPass, imageWidth, imageHeight, shaders,
7277 VK_PRIMITIVE_TOPOLOGY_PATCH_LIST, TessEvalRandomProgram::quadInvocationCount);
7278 Move<VkCommandPool> cmdPool =
7279 createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
7280 Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
7281
7282 PushConstant pc{};
7283 pc.invocationStride = 0u;
7284 pc.width = TessEvalRandomProgram::quadInvocationCount;
7285
7286 auto callRecordDrawingAndSubmit = std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this,
7287 *cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, std::cref(pc),
7288 std::cref(renderBeginInfo), **vertexBuffer, vertexCount, **image);
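    // The same draw is submitted twice: a counting pass in which every invocation reports how many ballot
    // locations it would write, then, after the ballot buffer has been (re)sized and cleared, a final pass
    // that records the actual ballots.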
7289
7290 // compute "maxLoc", which is a potential maximum number of locations written
7291 callRecordDrawingAndSubmit();
7292
7293 // Take the maximum of "maxLoc" over all invocations.
7294 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7295 auto rangeLoc = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], invocationStride);
7296 const uint32_t computedShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7297 log << tcu::TestLog::Message << "computed shaderMaxLoc: " << computedShaderMaxLoc << tcu::TestLog::EndMessage;
7298
7299 // If we need more space, reallocate OutputB::b[] aka buffers[1]
7300 if (computedShaderMaxLoc > simulationMaxLoc)
7301 {
7302 // Add one (to make sure no additional writes are done) and multiply by
7303 // the number of invocations and current primitive count
7304 maxLoc = (computedShaderMaxLoc + 1) * invocationStride;
7305 sizes[OutputBallots] = maxLoc * sizeof(uint64_t);
7306
7307 if (sizes[OutputBallots] > limits.maxStorageBufferRange)
7308 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
7309
7310 try
7311 {
7312 buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
7313 vk, device, allocator,
7314                 makeBufferCreateInfo(sizes[OutputBallots], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
7315 VK_BUFFER_USAGE_TRANSFER_SRC_BIT),
7316 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
7317 }
7318 catch (tcu::ResourceError &)
7319 {
7320 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7321 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7322 "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
7323 }
7324 bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
7325 ptrs[OutputBallots] = buffers[OutputBallots]->getAllocation().getHostPtr();
7326
7327 vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
7328 setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
7329 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
7330 setUpdateBuilder2.update(vk, device);
7331 }
7332
7333 // Clear any writes to ballots/stores OutputB::b[] aka buffer[1] during the counting pass
7334     // Note that its size may have changed since the first memory allocation
7335 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7336 // Clear any writes to counting OutputC::loc[] aka buffer[2] during the counting pass
7337 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7338
7339 // flush them all to the GPU
7340 flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7341 flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7342
7343 // run the actual shader with updated PushConstant
7344 pc.invocationStride = invocationStride;
7345 pc.width = TessEvalRandomProgram::quadInvocationCount;
7346 callRecordDrawingAndSubmit();
7347
7348 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7349 const uint32_t finalShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7350 log << tcu::TestLog::Message << "final shaderMaxLoc: " << finalShaderMaxLoc << tcu::TestLog::EndMessage;
7351 if (finalShaderMaxLoc > computedShaderMaxLoc)
7352 {
7353 std::stringstream s;
7354         s << "maxLoc differs between the counting and final passes: " << finalShaderMaxLoc << " and " << computedShaderMaxLoc;
7355 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING, s.str());
7356 }
7357
7358 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7359 invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7360
7361 // Simulate execution on the CPU, and compare against the GPU result
7362 try
7363 {
7364 ref.resize(maxLoc, 0ull);
7365 }
7366 catch (const std::bad_alloc &)
7367 {
7368 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7369 return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
7370 "Failed system memory allocation " + de::toString(maxLoc * sizeof(uint64_t)) + " bytes");
7371 }
7372
7373 program->simulate(false, m_subgroupSize, ref);
7374
7375 const uint64_t *ballots = static_cast<uint64_t *>(ptrs[OutputBallots]);
7376 qpTestResult res = calculateAndLogResult(ballots, ref, invocationStride, m_subgroupSize, finalShaderMaxLoc,
7377 (invocationStride / 3), PrintMode::None);
7378
7379 return tcu::TestStatus(res, qpGetTestResultName(res));
7380 }
7381
7382 de::MovePtr<BufferWithMemory> ReconvergenceTestGeometryInstance::createVertexBufferAndFlush(
7383 uint32_t cellsHorz, uint32_t cellsVert, VkPrimitiveTopology topology)
7384 {
7385 DE_UNREF(topology);
7386 DE_ASSERT(VK_PRIMITIVE_TOPOLOGY_POINT_LIST == topology);
7387 const std::vector<tcu::Vec4> vertices = GeometryRandomProgram::Arrangement::generatePrimitives(
7388 cellsHorz, cellsVert, GeometryRandomProgram::fillPercentage);
7389 return ReconvergenceTestGraphicsInstance::createVertexBufferAndFlush(vertices);
7390 }
7391
7392 std::vector<Move<VkShaderModule>> ReconvergenceTestGeometryInstance::createShaders(void)
7393 {
7394 const DeviceInterface &vk = m_context.getDeviceInterface();
7395 const VkDevice device = m_context.getDevice();
7396
7397 Move<VkShaderModule> vertex = createShaderModule(vk, device, m_context.getBinaryCollection().get("vert"));
7398 Move<VkShaderModule> fragment = createShaderModule(vk, device, m_context.getBinaryCollection().get("frag"));
7399 Move<VkShaderModule> geometry = createShaderModule(vk, device, m_context.getBinaryCollection().get("test"));
7400
7401     // Shader order: { vert, frag, tesc, tese, geom }; unused stages are left as empty modules
7402 std::vector<Move<VkShaderModule>> shaders;
7403 shaders.emplace_back(vertex);
7404 shaders.emplace_back(fragment);
7405 shaders.emplace_back();
7406 shaders.emplace_back();
7407 shaders.emplace_back(geometry);
7408
7409 return shaders;
7410 }
7411
7412 tcu::TestStatus ReconvergenceTestGeometryInstance::iterate(void)
7413 {
7414 const VkPhysicalDeviceLimits &limits = m_context.getDeviceProperties().limits;
7415 if (sizeof(PushConstant) > limits.maxPushConstantsSize)
7416 {
7417 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7418 "PushConstant size " + std::to_string(sizeof(PushConstant)) + " exceeds device limit " +
7419 std::to_string(limits.maxPushConstantsSize));
7420 }
7421
7422 const DeviceInterface &vk = m_context.getDeviceInterface();
7423 const VkDevice device = m_context.getDevice();
7424 Allocator &allocator = m_context.getDefaultAllocator();
7425 const uint32_t queueIndex = m_context.getUniversalQueueFamilyIndex();
7426 add_ref<tcu::TestLog> log = m_context.getTestContext().getLog();
7427 const VkPrimitiveTopology topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
7428 const uint32_t fragmentStride = uint32_t(m_data.sizeX * m_data.sizeY);
7429 const uint32_t invocationStride = GeometryRandomProgram::Arrangement::calculatePrimitiveCount(
7430 m_data.sizeX, m_data.sizeY, GeometryRandomProgram::fillPercentage);
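    // Each point primitive of the generated grid drives one geometry-shader invocation, so the primitive
    // count also serves as the per-invocation stride into the output buffers below.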
7431
7432 de::MovePtr<GeometryRandomProgram> program(new GeometryRandomProgram(m_data));
7433 program->generateRandomProgram(m_context.getTestContext().getWatchDog(), log);
7434
7435 // simulate content of outputP buffer
7436 std::vector<uint32_t> outputP =
7437 GeometryRandomProgram::Arrangement::generateVectorOutputP(m_subgroupSize, invocationStride);
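    // outputP mirrors the shader-side OutputP::p[] buffer; it is pre-filled here (presumably with the
    // expected invocation-to-subgroup mapping) so the first CPU simulation can run before any GPU results exist.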
7438
7439 std::vector<tcu::UVec4> ref;
7440 const uint32_t hostMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), true, m_subgroupSize,
7441 fragmentStride, invocationStride, ref, log, outputP, nullptr);
7442 log << tcu::TestLog::Message << "Rendering area : " << tcu::UVec2(m_data.sizeX, m_data.sizeY)
7443 << tcu::TestLog::EndMessage;
7444 log << tcu::TestLog::Message << "invocationStride: " << invocationStride << tcu::TestLog::EndMessage;
7445 log << tcu::TestLog::Message << "Simulated maxLoc: " << hostMaxLoc << tcu::TestLog::EndMessage;
7446 // maxLoc is per-invocation. Add one (to make sure no additional writes are done).
7447 uint32_t maxLoc = hostMaxLoc;
7448 maxLoc += 1;
7449 maxLoc *= invocationStride;
7450
7451 constexpr uint32_t bufferCount = 4u;
7452 enum Bindings
7453 {
7454 InputA,
7455 OutputBallots,
7456 OutputCounts,
7457 OutputPrimitives
7458 };
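    // Binding indices match the shader-side interface blocks described in the comments below:
    // 0 = InputA, 1 = OutputB (ballots), 2 = OutputC (per-invocation loc counts), 3 = OutputP (primitives).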
7459
7460 de::MovePtr<BufferWithMemory> buffers[bufferCount];
7461 vk::VkDescriptorBufferInfo bufferDescriptors[bufferCount];
7462
7463 uint32_t counts[bufferCount]{// InputA { uint a[]; } inputA;
7464 uint32_t(m_data.sizeX * m_data.sizeY),
7465                                  // OutputB { uvec4 b[]; } outputB;
7466 maxLoc,
7467 // OutputC { uint loc[]; } outputC;
7468 invocationStride,
7469 // OutputP { uint p[]; } outputP;
7470 uint32_t(outputP.size())};
7471
7472 VkDeviceSize sizes[bufferCount]{// InputA { uint a[]; } inputA;
7473 counts[InputA] * sizeof(uint32_t),
7474                                     // OutputB { uvec4 b[]; } outputB;
7475 counts[OutputBallots] * sizeof(tcu::UVec4),
7476 // OutputC { uint loc[]; } outputC;
7477 counts[OutputCounts] * sizeof(uint32_t),
7478 // OutputP { uint p[]; } outputP;
7479 counts[OutputPrimitives] * sizeof(uint32_t)};
7480
7481 const VkBufferUsageFlags cmnUsages = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
7482 VkBufferUsageFlags usages[bufferCount]{
7483 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7484 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7485 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7486 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
7487 };
7488
7489 // allocate buffers
7490 for (uint32_t i = 0; i < bufferCount; ++i)
7491 {
7492 if (sizes[i] > limits.maxStorageBufferRange)
7493 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
7494 try
7495 {
7496 buffers[i] = de::MovePtr<BufferWithMemory>(
7497 new BufferWithMemory(vk, device, allocator, makeBufferCreateInfo(sizes[i], usages[i] | cmnUsages),
7498 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
7499 }
7500 catch (tcu::ResourceError &)
7501 {
7502 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7503 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7504 "Failed device memory allocation " + de::toString(sizes[i]) + " bytes");
7505 }
7506 bufferDescriptors[i] = makeDescriptorBufferInfo(**buffers[i], 0, sizes[i]);
7507 }
7508
7509 // get raw pointers to previously allocated buffers
7510 void *ptrs[bufferCount];
7511 for (uint32_t i = 0; i < bufferCount; ++i)
7512 {
7513 ptrs[i] = (uint32_t *)buffers[i]->getAllocation().getHostPtr();
7514 }
7515
7516     // populate buffers with their initial content
7517 {
7518 auto rangeBufferA = makeStdBeginEnd<uint32_t>(ptrs[InputA], counts[InputA]);
7519 std::iota(rangeBufferA.first, rangeBufferA.second, 0u);
7520 }
7521 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7522 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7523 deMemset(ptrs[OutputPrimitives], 0, (size_t)sizes[OutputPrimitives]);
7524
7525 // (...) and flush them to the GPU
7526 for (uint32_t i = 0; i < bufferCount; ++i)
7527 {
7528 flushAlloc(vk, device, buffers[i]->getAllocation());
7529 }
7530
7531 VkDescriptorType descTypes[bufferCount]{
7532 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7533 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7534 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7535 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
7536 };
7537
7538 vk::DescriptorSetLayoutBuilder layoutBuilder;
7539 for (uint32_t i = 0; i < bufferCount; ++i)
7540 {
7541 layoutBuilder.addSingleBinding(descTypes[i], m_data.shaderStage);
7542 }
7543 vk::Unique<vk::VkDescriptorSetLayout> descriptorSetLayout(layoutBuilder.build(vk, device));
7544
7545 vk::DescriptorPoolBuilder poolBuilder;
7546 for (uint32_t i = 0; i < bufferCount; ++i)
7547 {
7548 poolBuilder.addType(descTypes[i], 1);
7549 }
7550 vk::Unique<vk::VkDescriptorPool> descriptorPool(
7551 poolBuilder.build(vk, device, VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT, 1u));
7552 vk::Unique<vk::VkDescriptorSet> descriptorSet(makeDescriptorSet(vk, device, *descriptorPool, *descriptorSetLayout));
7553
7554 vk::DescriptorSetUpdateBuilder setUpdateBuilder;
7555 for (uint32_t i = 0; i < bufferCount; ++i)
7556 {
7557 setUpdateBuilder.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(i), descTypes[i],
7558 &bufferDescriptors[i]);
7559 }
7560 setUpdateBuilder.update(vk, device);
7561
7562 const VkPushConstantRange pushConstantRange{
7563 (VkShaderStageFlags)m_data.shaderStage, // VkShaderStageFlags stageFlags;
7564 0u, // uint32_t offset;
7565 sizeof(PushConstant) // uint32_t size;
7566 };
7567
7568 const VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo{
7569 VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType
7570 DE_NULL, // pNext
7571 (VkPipelineLayoutCreateFlags)0, // flags
7572 1u, // setLayoutCount
7573 &descriptorSetLayout.get(), // pSetLayouts
7574 1u, // pushConstantRangeCount
7575 &pushConstantRange, // pPushConstantRanges
7576 };
7577
7578 const uint32_t imageWidth = m_data.sizeX;
7579 const uint32_t imageHeight = m_data.sizeY;
7580 const VkFormat format = VK_FORMAT_R8G8B8A8_UNORM;
7581 const VkImageCreateInfo imageCreateInfo{
7582 VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // VkStructureType sType;
7583 nullptr, // const void* pNext;
7584 VkImageCreateFlags(0), // VkImageCreateFlags flags;
7585 VK_IMAGE_TYPE_2D, // VkImageType imageType;
7586 format, // VkFormat format;
7587 {imageWidth, imageHeight, 1u}, // VkExtent3D extent;
7588 1u, // uint32_t mipLevels;
7589 1u, // uint32_t arrayLayers;
7590 VK_SAMPLE_COUNT_1_BIT, // VkSampleCountFlagBits samples;
7591 VK_IMAGE_TILING_OPTIMAL, // VkImageTiling tiling;
7592 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, // VkImageUsageFlags usage;
7593 VK_SHARING_MODE_EXCLUSIVE, // VkSharingMode sharingMode;
7594 0u, // uint32_t queueFamilyIndexCount;
7595         nullptr,                               // const uint32_t* pQueueFamilyIndices;
7596 VK_IMAGE_LAYOUT_UNDEFINED // VkImageLayout initialLayout;
7597 };
7598 const VkImageSubresourceRange rscRange = makeImageSubresourceRange(VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u);
7599 de::MovePtr<ImageWithMemory> image(
7600 new ImageWithMemory(vk, device, allocator, imageCreateInfo, vk::MemoryRequirement::Any));
7601 Move<VkImageView> view = makeImageView(vk, device, **image, VK_IMAGE_VIEW_TYPE_2D, format, rscRange);
7602 Move<VkRenderPass> renderPass = makeRenderPass(vk, device, format);
7603 Move<VkFramebuffer> framebuffer =
7604 makeFramebuffer(vk, device, *renderPass, *view, m_data.sizeX, m_data.sizeY, rscRange.layerCount);
7605 de::MovePtr<BufferWithMemory> vertexBuffer = createVertexBufferAndFlush(m_data.sizeX, m_data.sizeY, topology);
7606 const VkRenderPassBeginInfo renderBeginInfo = makeRenderPassBeginInfo(*renderPass, *framebuffer);
7607 const Shaders shaders = createShaders();
7608 Move<VkPipelineLayout> pipelineLayout = createPipelineLayout(vk, device, &pipelineLayoutCreateInfo, NULL);
7609 Move<VkPipeline> pipeline = createGraphicsPipeline(*pipelineLayout, *renderPass, imageWidth, imageHeight, shaders,
7610 VK_PRIMITIVE_TOPOLOGY_POINT_LIST);
7611 Move<VkCommandPool> cmdPool =
7612 createCommandPool(vk, device, vk::VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queueIndex);
7613 Move<VkCommandBuffer> cmdBuffer = allocateCommandBuffer(vk, device, *cmdPool, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
7614
7615 PushConstant pc{};
7616 pc.invocationStride = invocationStride;
7617 pc.width = m_data.sizeX;
7618 pc.height = m_data.sizeY;
7619 pc.enableInvocationIndex = VK_FALSE;
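    // Counting pass: the draw is first submitted with enableInvocationIndex == VK_FALSE; the final pass
    // below flips it to VK_TRUE and replays the identical draw to record the actual ballots.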
7620
7621 auto callRecordDrawingAndSubmit = std::bind(&ReconvergenceTestGraphicsInstance::recordDrawingAndSubmit, this,
7622 *cmdBuffer, *pipelineLayout, *pipeline, *descriptorSet, std::cref(pc),
7623 std::cref(renderBeginInfo), **vertexBuffer, invocationStride, **image);
7624
7625 // compute "maxLoc", which is a potential maximum number of locations written
7626 callRecordDrawingAndSubmit();
7627
7628 // Take the maximum of "maxLoc" over all invocations.
7629 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7630 auto rangeLoc = makeStdBeginEnd<const uint32_t>(ptrs[OutputCounts], invocationStride);
7631 const uint32_t shaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7632 log << tcu::TestLog::Message << "computed maxLoc in shader: " << shaderMaxLoc << tcu::TestLog::EndMessage;
7633
7634 // If we need more space, reallocate OutputB::b[] aka buffers[1]
7635 if (shaderMaxLoc > hostMaxLoc)
7636 {
7637 // Add one (to make sure no additional writes are done) and multiply by
7638 // the number of invocations and current primitive count
7639 maxLoc = (std::max(shaderMaxLoc, hostMaxLoc) + 1u) * invocationStride;
7640 counts[OutputBallots] = maxLoc;
7641 sizes[OutputBallots] = counts[OutputBallots] * sizeof(tcu::UVec4);
7642
7643 if (sizes[OutputBallots] > limits.maxStorageBufferRange)
7644 TCU_THROW(NotSupportedError, "Storage buffer size larger than device limits");
7645
7646 try
7647 {
7648 buffers[OutputBallots] = de::MovePtr<BufferWithMemory>(new BufferWithMemory(
7649                 vk, device, allocator, makeBufferCreateInfo(sizes[OutputBallots], VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | cmnUsages),
7650 MemoryRequirement::HostVisible | MemoryRequirement::Cached));
7651 }
7652 catch (tcu::ResourceError &)
7653 {
7654 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7655 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7656 "Failed device memory allocation " + de::toString(sizes[OutputBallots]) + " bytes");
7657 }
7658 bufferDescriptors[OutputBallots] = makeDescriptorBufferInfo(**buffers[OutputBallots], 0, sizes[OutputBallots]);
7659 ptrs[OutputBallots] = buffers[OutputBallots]->getAllocation().getHostPtr();
7660
7661 vk::DescriptorSetUpdateBuilder setUpdateBuilder2;
7662 setUpdateBuilder2.writeSingle(*descriptorSet, vk::DescriptorSetUpdateBuilder::Location::binding(OutputBallots),
7663 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &bufferDescriptors[OutputBallots]);
7664 setUpdateBuilder2.update(vk, device);
7665 }
7666
7667 // Clear any writes to ballots/stores OutputB::b[] aka buffer[1] during the counting pass
7668     // Note that its size may have changed since the first memory allocation
7669 deMemset(ptrs[OutputBallots], 0, (size_t)sizes[OutputBallots]);
7670 deMemset(ptrs[OutputCounts], 0, (size_t)sizes[OutputCounts]);
7671 deMemset(ptrs[OutputPrimitives], 0, (size_t)sizes[OutputPrimitives]);
7672
7673 // flush them all to the GPU
7674 flushAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7675 flushAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7676 flushAlloc(vk, device, buffers[OutputPrimitives]->getAllocation());
7677
7678 // run the actual shader with updated PushConstant
7679 pc.enableInvocationIndex = VK_TRUE;
7680 callRecordDrawingAndSubmit();
7681
7682 invalidateAlloc(vk, device, buffers[OutputCounts]->getAllocation());
7683 const uint32_t finalShaderMaxLoc = (*max_element(rangeLoc.first, rangeLoc.second));
7684 log << tcu::TestLog::Message << "final shaderMaxLoc: " << finalShaderMaxLoc << tcu::TestLog::EndMessage;
7685 if (finalShaderMaxLoc != shaderMaxLoc)
7686 {
7687 return tcu::TestStatus(QP_TEST_RESULT_QUALITY_WARNING,
7688                                "maxLoc differs between the counting and final passes, expected: " + de::toString(shaderMaxLoc) +
7689 " got: " + de::toString(finalShaderMaxLoc));
7690 }
7691
7692 invalidateAlloc(vk, device, buffers[OutputBallots]->getAllocation());
7693 const tcu::UVec4 *ballots = static_cast<tcu::UVec4 *>(ptrs[OutputBallots]);
7694
7695 invalidateAlloc(vk, device, buffers[OutputPrimitives]->getAllocation());
7696 auto outputPrange = makeStdBeginEnd<uint32_t>(ptrs[OutputPrimitives], counts[OutputPrimitives]);
7697 std::copy(outputPrange.first, outputPrange.second, outputP.begin());
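    // Read back OutputP so the CPU simulation below consumes the values actually written by the GPU.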
7698
7699 try
7700 {
7701 ref.resize(counts[OutputBallots], tcu::UVec4(0u, 0u, 0u, 0u));
7702 }
7703 catch (const std::bad_alloc &)
7704 {
7705 // Allocation size is unpredictable and can be too large for some systems. Don't treat allocation failure as a test failure.
7706 return tcu::TestStatus(QP_TEST_RESULT_NOT_SUPPORTED,
7707                                "Failed system memory allocation " + de::toString(counts[OutputBallots] * sizeof(tcu::UVec4)) + " bytes");
7708 }
7709
7710 // Simulate execution on the CPU, and compare against the GPU result
7711 const uint32_t finalHostMaxLoc = program->execute(m_context.getTestContext().getWatchDog(), false, m_subgroupSize,
7712 fragmentStride, invocationStride, ref, log, outputP, ballots);
7713
7714 const qpTestResult res = calculateAndLogResultEx(log, ballots, ref, finalHostMaxLoc, PrintMode::None);
7715
7716 return tcu::TestStatus(res, qpGetTestResultName(res));
7717 }
7718
7719 qpTestResult_e ReconvergenceTestGeometryInstance::calculateAndLogResultEx(add_ref<tcu::TestLog> log,
7720 const tcu::UVec4 *result,
7721 const std::vector<tcu::UVec4> &ref,
7722 const uint32_t maxLoc,
7723 const PrintMode printMode)
7724 {
7725 DE_UNREF(maxLoc);
7726 DE_UNREF(printMode);
7727
7728 qpTestResult res = QP_TEST_RESULT_PASS;
7729 uint32_t mismatchCount = 0u;
7730 const uint32_t printMismatchCount = 5u;
7731
7732 // With maximal reconvergence, we should expect the output to exactly match the reference.
7733 const uint32_t ballotStoreCount = static_cast<uint32_t>(ref.size());
7734 for (uint32_t i = 0; i < ballotStoreCount; ++i)
7735 {
7736 const Ballot resultVal(result[i], m_subgroupSize);
7737 const Ballot refVal(ref.at(i), m_subgroupSize);
7738 if (resultVal != refVal)
7739 {
7740 if (mismatchCount++ < printMismatchCount)
7741 {
7742 res = QP_TEST_RESULT_FAIL;
7743                 log << tcu::TestLog::Message << "Mismatch at " << i << "\nexpected: " << refVal
7744                     << "\n     got: " << resultVal << tcu::TestLog::EndMessage;
7745 if (printMode == PrintMode::Console)
7746 {
7747                     std::cout << "Mismatch at " << i << "\nexpected: " << refVal << "\n     got: " << resultVal
7748 << std::endl;
7749 }
7750 }
7751 }
7752 }
7753
7754 log << tcu::TestLog::Message << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount
7755 << tcu::TestLog::EndMessage;
7756 if (printMode == PrintMode::Console)
7757 {
7758 std::cout << "Mismatch count: " << mismatchCount << " from " << ballotStoreCount << std::endl;
7759 }
7760
7761 return res;
7762 }
7763
7764 void createAmberFragmentTestCases(add_ref<tcu::TestContext> testCtx, add_ptr<tcu::TestCaseGroup> group);
7765
7766 tcu::TestCaseGroup *createTests(tcu::TestContext &testCtx, const std::string &name, bool createExperimental)
7767 {
7768 de::MovePtr<tcu::TestCaseGroup> group(new tcu::TestCaseGroup(testCtx, name.c_str(), "reconvergence tests"));
7769
7770 typedef struct
7771 {
7772 uint32_t value;
7773 const char *name;
7774 const char *description;
7775 } TestGroupCase;
7776
7777 TestGroupCase ttCases[] = {
7778 {TT_SUCF_ELECT, "subgroup_uniform_control_flow_elect", "subgroup_uniform_control_flow_elect"},
7779 {TT_SUCF_BALLOT, "subgroup_uniform_control_flow_ballot", "subgroup_uniform_control_flow_ballot"},
7780 {TT_WUCF_ELECT, "workgroup_uniform_control_flow_elect", "workgroup_uniform_control_flow_elect"},
7781 {TT_WUCF_BALLOT, "workgroup_uniform_control_flow_ballot", "workgroup_uniform_control_flow_ballot"},
7782 {TT_MAXIMAL, "maximal", "maximal"},
7783 };
7784
7785 std::pair<VkShaderStageFlagBits, const char *> const stTypes[]{
7786 {VK_SHADER_STAGE_COMPUTE_BIT, "compute"},
7787 {VK_SHADER_STAGE_FRAGMENT_BIT, "fragment"},
7788 #ifdef INCLUDE_GRAPHICS_TESTS
7789 {VK_SHADER_STAGE_VERTEX_BIT, "vertex"},
7790 {VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT, "tessctrl"},
7791 {VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, "tesseval"},
7792 {VK_SHADER_STAGE_GEOMETRY_BIT, "geometry"},
7793 #endif
7794 };
7795
7796 for (int ttNdx = 0; ttNdx < DE_LENGTH_OF_ARRAY(ttCases); ttNdx++)
7797 {
7798 de::MovePtr<tcu::TestCaseGroup> ttGroup(
7799 new tcu::TestCaseGroup(testCtx, ttCases[ttNdx].name, ttCases[ttNdx].description));
7800
7801 for (int stNdx = 0; stNdx < DE_LENGTH_OF_ARRAY(stTypes); ++stNdx)
7802 {
7803             // Only 'maximal' tests are generated for shader stages other than compute.
7804 if (stTypes[stNdx].first != VK_SHADER_STAGE_COMPUTE_BIT && ttCases[ttNdx].value != TT_MAXIMAL)
7805 continue;
7806
7807 de::MovePtr<tcu::TestCaseGroup> shaderGroup(new tcu::TestCaseGroup(testCtx, stTypes[stNdx].second, ""));
7808
7809 uint32_t nNdx = 2;
7810
7811 if (stTypes[stNdx].first == VK_SHADER_STAGE_FRAGMENT_BIT)
7812 {
7813 nNdx = 7;
7814 createAmberFragmentTestCases(testCtx, shaderGroup.get());
7815 }
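            // nNdx now starts past the loop bound below, so the fragment stage only gets the Amber-based
            // cases added above and skips the randomized nesting groups.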
7816
7817 for (/*uint32_t nNdx = 2*/; nNdx <= 6; nNdx++)
7818 {
7819 de::MovePtr<tcu::TestCaseGroup> nestGroup(
7820 new tcu::TestCaseGroup(testCtx, ("nesting" + de::toString(nNdx)).c_str(), ""));
7821
7822 uint32_t seed = 0;
7823
7824 for (int sNdx = 0; sNdx < 8; sNdx++)
7825 {
7826 de::MovePtr<tcu::TestCaseGroup> seedGroup(
7827 new tcu::TestCaseGroup(testCtx, de::toString(sNdx).c_str(), ""));
7828
7829 uint32_t numTests = 0;
7830 switch (nNdx)
7831 {
7832 default:
7833 DE_ASSERT(0);
7834 // fallthrough
7835 case 2:
7836 case 3:
7837 case 4:
7838 numTests = 250;
7839 break;
7840 case 5:
7841 numTests = 100;
7842 break;
7843 case 6:
7844 numTests = 50;
7845 break;
7846 }
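                    // Uniform-control-flow variants only cover nesting depths 2..4; deeper nesting is
                    // exercised by the maximal reconvergence tests alone.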
7847
7848 if (ttCases[ttNdx].value != TT_MAXIMAL)
7849 {
7850 if (nNdx >= 5)
7851 continue;
7852 }
7853
7854 for (uint32_t ndx = 0; ndx < numTests; ndx++)
7855 {
7856 uint32_t dim = 0u;
7857 DE_UNREF(dim);
7858 uint32_t sizeX = 0u;
7859 uint32_t sizeY = 0u;
7860 switch (stTypes[stNdx].first)
7861 {
7862 case VK_SHADER_STAGE_COMPUTE_BIT:
7863 // we want to test at least full subgroup
7864                         // both are prime numbers
7865 sizeX = 7u;
7866 sizeY = 13u;
7867 break;
7868 case VK_SHADER_STAGE_FRAGMENT_BIT:
7869 sizeX = 32;
7870 sizeY = 32;
7871 break;
7872 case VK_SHADER_STAGE_VERTEX_BIT:
7873 // we want to test at least full subgroup
7874 dim = uint32_t(std::ceil(
7875 std::sqrt((double)(((128u + 31u) * 100u) / VertexRandomProgram::fillPercentage))));
7876 sizeX = dim;
7877 sizeY = dim;
7878 break;
7879 case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
7880 sizeX = 19; // positive number of desired subgroups
7881 sizeY = 1; // used only for framebuffer extent in TCS test
7882 break;
7883 case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
7884 sizeX = 23; // positive number of desired subgroups
7885 sizeY = 1; // used only for framebuffer extent in TES test
7886 break;
7887 case VK_SHADER_STAGE_GEOMETRY_BIT:
7888 // we want to test at least full subgroup
7889 dim = uint32_t(std::ceil(
7890 std::sqrt((double)(((128u + 29u) * 100u) / GeometryRandomProgram::fillPercentage))));
7891 sizeX = dim;
7892 sizeY = dim;
7893 break;
7894 default:
7895 DE_ASSERT(0);
7896 }
7897 CaseDef c = {
7898 stTypes[stNdx].first, // VkShaderStageFlagBits shaderStage
7899 (TestType)ttCases[ttNdx].value, // TestType testType;
7900 nNdx, // uint32_t maxNesting;
7901 seed, // uint32_t seed;
7902 sizeX, // uint32_t sizeX;
7903 sizeY // uint32_t sizeY;
7904 };
7905 // product of sizeX and sizeY must not exceed MAX_INVOCATIONS_ALL_TESTS
7906 DE_ASSERT(c.verify());
7907 seed++;
7908
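                        // Roughly the first fifth of each seed group lands in the default test set;
                        // the remaining cases are emitted only by the experimental group factory.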
7909 bool isExperimentalTest = (ndx >= numTests / 5);
7910
7911 if (createExperimental == isExperimentalTest)
7912 seedGroup->addChild(new ReconvergenceTestCase(testCtx, de::toString(ndx).c_str(), c));
7913 }
7914 if (!seedGroup->empty())
7915 nestGroup->addChild(seedGroup.release());
7916 }
7917 if (!nestGroup->empty())
7918 shaderGroup->addChild(nestGroup.release());
7919 }
7920 if (!shaderGroup->empty())
7921 ttGroup->addChild(shaderGroup.release());
7922 }
7923 group->addChild(ttGroup.release());
7924 }
7925
7926 return group.release();
7927 }
7928
7929 void createAmberFragmentTestCases(add_ref<tcu::TestContext> testCtx, add_ptr<tcu::TestCaseGroup> group)
7930 {
7931 using namespace cts_amber;
7932
7933 enum Tests
7934 {
7935 TERMINATE_INVOCATION,
7936 DEMOTE_INVOCATION,
7937 DEMOTE_ENTIRE_QUAD,
7938 DEMOTE_HALF_QUAD_TOP,
7939 DEMOTE_HALF_QUAD_RIGHT,
7940 DEMOTE_HALF_QUAD_BOTTOM,
7941 DEMOTE_HALF_QUAD_LEFT,
7942 DEMOTE_HALF_QUAD_SLASH,
7943 DEMOTE_HALF_QUAD_BACKSLASH
7944 };
7945
7946 struct Case
7947 {
7948 Tests test;
7949 add_cptr<char> name;
7950 add_cptr<char> desc;
7951 std::size_t hname;
7952 Case(Tests aTest, add_cptr<char> aName, add_cptr<char> aDesc)
7953 : test(aTest)
7954 , name(aName)
7955 , desc(aDesc)
7956 , hname(std::hash<std::string>()(std::string(aName)))
7957 {
7958 }
7959 bool matches(add_cref<std::string> aName) const
7960 {
7961 return hname == std::hash<std::string>()(aName);
7962 }
7963 static bool matches(add_cref<std::string> aName, std::initializer_list<Case> aList)
7964 {
7965 for (auto i = aList.begin(); i != aList.end(); ++i)
7966 {
7967 if (i->matches(aName))
7968 return true;
7969 }
7970 return false;
7971 }
7972 std::string makeFileName() const
7973 {
7974 return (std::string(name) + ".amber");
7975 }
7976 } static const cases[]{
7977 Case(TERMINATE_INVOCATION, "terminate_invocation",
7978 "Verifies that terminated invocation is no longer included in the ballot"),
7979 Case(DEMOTE_INVOCATION, "demote_invocation",
7980 "Verifies that the demoted invocation is not present in the ballot"),
7981 Case(DEMOTE_ENTIRE_QUAD, "demote_entire_quad", "Verifies that the demoted quad is not present in the ballot"),
7982 Case(DEMOTE_HALF_QUAD_TOP, "demote_half_quad_top",
7983 "Verifies that the demoted part of the quad is not present in the ballot"),
7984 Case(DEMOTE_HALF_QUAD_RIGHT, "demote_half_quad_right",
7985 "Verifies that the demoted part of the quad is not present in the ballot"),
7986 Case(DEMOTE_HALF_QUAD_BOTTOM, "demote_half_quad_bottom",
7987 "Verifies that the demoted part of the quad is not present in the ballot"),
7988 Case(DEMOTE_HALF_QUAD_LEFT, "demote_half_quad_left",
7989 "Verifies that the demoted part of the quad is not present in the ballot"),
7990 Case(DEMOTE_HALF_QUAD_SLASH, "demote_half_quad_slash",
7991 "Verifies that the demoted part of the quad is not present in the ballot"),
7992 Case(DEMOTE_HALF_QUAD_BACKSLASH, "demote_half_quad_backslash",
7993 "Verifies that the demoted part of the quad is not present in the ballot"),
7994 };
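    // Each case maps to an .amber script of the same name, looked up via the testsFolder path built below
    // (reconvergence/maximal/<group name>) inside the CTS Amber data directory.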
7995
7996 auto testSupports = [](Context &context, std::string testName) -> void
7997 {
7998 if (!(context.getSubgroupProperties().supportedStages & VK_SHADER_STAGE_FRAGMENT_BIT))
7999 TCU_THROW(NotSupportedError, "Subgroup operations not supported in fragment stage");
8000
8001 if (!context.getShaderMaximalReconvergenceFeatures().shaderMaximalReconvergence)
8002 TCU_THROW(NotSupportedError, "shaderMaximalReconvergence not supported");
8003
8004 if (!(context.getSubgroupProperties().supportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT))
8005 TCU_THROW(NotSupportedError, "VK_SUBGROUP_FEATURE_BALLOT_BIT not supported");
8006
8007 if (Case::matches(testName, {cases[DEMOTE_ENTIRE_QUAD]}))
8008 {
8009 if (!(context.getSubgroupProperties().subgroupSize > 4))
8010 TCU_THROW(NotSupportedError, "subgroupSize is less than or equal to 4");
8011 }
8012 else
8013 {
8014 if (!(context.getSubgroupProperties().subgroupSize >= 4))
8015 TCU_THROW(NotSupportedError, "subgroupSize is less than 4");
8016 }
8017
8018 if (Case::matches(testName, {cases[TERMINATE_INVOCATION]}))
8019 {
8020 if (!context.getShaderTerminateInvocationFeatures().shaderTerminateInvocation)
8021 TCU_THROW(NotSupportedError, "shaderTerminateInvocation not supported.");
8022 }
8023 else
8024 {
8025 #ifndef CTS_USES_VULKANSC
8026 if (!context.getShaderDemoteToHelperInvocationFeatures().shaderDemoteToHelperInvocation)
8027 TCU_THROW(NotSupportedError, "demoteToHelperInvocation not supported.");
8028 #else
8029 if (!context.getShaderDemoteToHelperInvocationFeaturesEXT().shaderDemoteToHelperInvocation)
8030 TCU_THROW(NotSupportedError, "demoteToHelperInvocation not supported.");
8031 #endif
8032 }
8033 };
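    // updateTest attaches the support-check callback to every generated Amber case so unsupported
    // configurations end up as NotSupported rather than test failures.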
8034
8035 auto updateTest = [&](add_ptr<AmberTestCase> theTest) -> add_ptr<AmberTestCase>
8036 {
8037 theTest->setCheckSupportCallback(testSupports);
8038 return theTest;
8039 };
8040
8041 const std::string testsFolder(std::string("reconvergence/maximal/") + group->getName());
8042
8043 for (add_cref<Case> aCase : cases)
8044 {
8045 group->addChild(updateTest(
8046 createAmberTestCase(testCtx, aCase.name, aCase.desc, testsFolder.c_str(), aCase.makeFileName())));
8047 }
8048 }
8049
8050 } // namespace
8051
8052 tcu::TestCaseGroup *createTests(tcu::TestContext &testCtx, const std::string &name)
8053 {
8054 return createTests(testCtx, name, false);
8055 }
8056
8057 tcu::TestCaseGroup *createTestsExperimental(tcu::TestContext &testCtx, const std::string &name)
8058 {
8059 return createTests(testCtx, name, true);
8060 }
8061
8062 } // namespace Reconvergence
8063 } // namespace vkt
8064