xref: /aosp_15_r20/external/skia/tests/graphite/ComputeTest.cpp (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
/*
 * Copyright 2022 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7 
8 #include "tests/Test.h"
9 
10 #include "include/core/SkBitmap.h"
11 #include "include/gpu/graphite/Context.h"
12 #include "include/gpu/graphite/Recorder.h"
13 #include "include/gpu/graphite/Recording.h"
14 #include "src/gpu/graphite/Buffer.h"
15 #include "src/gpu/graphite/Caps.h"
16 #include "src/gpu/graphite/ComputePipelineDesc.h"
17 #include "src/gpu/graphite/ComputeTypes.h"
18 #include "src/gpu/graphite/ContextPriv.h"
19 #include "src/gpu/graphite/RecorderPriv.h"
20 #include "src/gpu/graphite/ResourceProvider.h"
21 #include "src/gpu/graphite/UniformManager.h"
22 #include "src/gpu/graphite/compute/ComputeStep.h"
23 #include "src/gpu/graphite/compute/DispatchGroup.h"
24 #include "src/gpu/graphite/task/ComputeTask.h"
25 #include "src/gpu/graphite/task/CopyTask.h"
26 #include "src/gpu/graphite/task/SynchronizeToCpuTask.h"
27 #include "src/gpu/graphite/task/UploadTask.h"
28 
29 #include "tools/graphite/GraphiteTestContext.h"
30 
31 using namespace skgpu::graphite;
32 using namespace skiatest::graphite;
33 
34 namespace {
35 
map_buffer(Context * context,skiatest::graphite::GraphiteTestContext * testContext,Buffer * buffer,size_t offset)36 void* map_buffer(Context* context,
37                  skiatest::graphite::GraphiteTestContext* testContext,
38                  Buffer* buffer,
39                  size_t offset) {
40     SkASSERT(buffer);
41     if (context->priv().caps()->bufferMapsAreAsync()) {
42         buffer->asyncMap();
43         while (!buffer->isMapped()) {
44             testContext->tick();
45         }
46     }
47     std::byte* ptr = static_cast<std::byte*>(buffer->map());
48     SkASSERT(ptr);
49 
50     return ptr + offset;
51 }
52 
sync_buffer_to_cpu(Recorder * recorder,const Buffer * buffer)53 sk_sp<Buffer> sync_buffer_to_cpu(Recorder* recorder, const Buffer* buffer) {
54     if (recorder->priv().caps()->drawBufferCanBeMappedForReadback()) {
55         // `buffer` can be mapped directly, however it may still require a synchronization step
56         // by the underlying API (e.g. a managed buffer in Metal). SynchronizeToCpuTask
57         // automatically handles this for us.
58         recorder->priv().add(SynchronizeToCpuTask::Make(sk_ref_sp(buffer)));
59         return sk_ref_sp(buffer);
60     }
61 
62     // The backend requires a transfer buffer for CPU read-back
63     auto xferBuffer =
64             recorder->priv().resourceProvider()->findOrCreateBuffer(buffer->size(),
65                                                                     BufferType::kXferGpuToCpu,
66                                                                     AccessPattern::kHostVisible,
67                                                                     "ComputeTest_TransferToCpu");
68     SkASSERT(xferBuffer);
69 
70     recorder->priv().add(CopyBufferToBufferTask::Make(buffer,
71                                                       /*srcOffset=*/0,
72                                                       xferBuffer,
73                                                       /*dstOffset=*/0,
74                                                       buffer->size()));
75     return xferBuffer;
76 }
77 
submit_recording(Context * context,GraphiteTestContext * testContext,Recorder * recorder)78 std::unique_ptr<Recording> submit_recording(Context* context,
79                                             GraphiteTestContext* testContext,
80                                             Recorder* recorder) {
81     std::unique_ptr<Recording> recording = recorder->snap();
82     if (!recording) {
83         return nullptr;
84     }
85 
86     InsertRecordingInfo insertInfo;
87     insertInfo.fRecording = recording.get();
88     context->insertRecording(insertInfo);
89     testContext->syncedSubmit(context);
90 
91     return recording;
92 }
93 
is_dawn_or_metal_context_type(skiatest::GpuContextType ctxType)94 bool is_dawn_or_metal_context_type(skiatest::GpuContextType ctxType) {
95     return skiatest::IsDawnContextType(ctxType) || skiatest::IsMetalContextType(ctxType);
96 }
97 
98 }  // namespace
99 
// Registers a graphite test that only runs on Dawn and Metal contexts (the backends
// with compute support) and is never enforced by CTS.
#define DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(            \
        name, reporter, graphite_context, test_context)           \
    DEF_GRAPHITE_TEST_FOR_CONTEXTS(name,                          \
                                   is_dawn_or_metal_context_type, \
                                   reporter,                      \
                                   graphite_context,              \
                                   test_context,                  \
                                   CtsEnforcement::kNever)
108 
109 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
110 // compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_SingleDispatchTest,reporter,context,testContext)111 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_SingleDispatchTest,
112                                               reporter,
113                                               context,
114                                               testContext) {
115     constexpr uint32_t kProblemSize = 512;
116     constexpr float kFactor = 4.f;
117 
118     // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
119     // processes 1 vector at a time.
120     constexpr uint32_t kWorkgroupSize = kProblemSize / 4;
121 
122     std::unique_ptr<Recorder> recorder = context->makeRecorder();
123 
124     class TestComputeStep : public ComputeStep {
125     public:
126         // TODO(skia:40045541): SkSL doesn't support std430 layout well, so the buffers
127         // below all pack their data into vectors to be compatible with SPIR-V/WGSL.
128         TestComputeStep() : ComputeStep(
129                 /*name=*/"TestArrayMultiply",
130                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
131                 /*resources=*/{
132                     // Input buffer:
133                     {
134                         // TODO(b/299979165): Declare this binding as read-only.
135                         /*type=*/ResourceType::kStorageBuffer,
136                         /*flow=*/DataFlow::kPrivate,
137                         /*policy=*/ResourcePolicy::kMapped,
138                         /*sksl=*/"inputBlock {\n"
139                             "    float factor;\n"
140                             "    layout(offset=16) float4 in_data[];\n"
141                             "}",
142                     },
143                     // Output buffer:
144                     {
145                         /*type=*/ResourceType::kStorageBuffer,
146                         /*flow=*/DataFlow::kShared,  // shared to allow us to access it from the
147                                                      // Builder
148                         /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
149                         /*slot=*/0,
150                         /*sksl=*/"outputBlock { float4 out_data[]; }",
151                     }
152                 }) {}
153         ~TestComputeStep() override = default;
154 
155         // A kernel that multiplies a large array of floats by a supplied factor.
156         std::string computeSkSL() const override {
157             return R"(
158                 void main() {
159                     out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
160                 }
161             )";
162         }
163 
164         size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
165             if (index == 0) {
166                 SkASSERT(r.fFlow == DataFlow::kPrivate);
167                 return sizeof(float) * (kProblemSize + 4);
168             }
169             SkASSERT(index == 1);
170             SkASSERT(r.fSlot == 0);
171             SkASSERT(r.fFlow == DataFlow::kShared);
172             return sizeof(float) * kProblemSize;
173         }
174 
175         void prepareStorageBuffer(int resourceIndex,
176                                   const ResourceDesc& r,
177                                   void* buffer,
178                                   size_t bufferSize) const override {
179             // Only initialize the input buffer.
180             if (resourceIndex != 0) {
181                 return;
182             }
183             SkASSERT(r.fFlow == DataFlow::kPrivate);
184 
185             size_t dataCount = sizeof(float) * (kProblemSize + 4);
186             SkASSERT(bufferSize == dataCount);
187             SkSpan<float> inData(static_cast<float*>(buffer), dataCount);
188             inData[0] = kFactor;
189             for (unsigned int i = 0; i < kProblemSize; ++i) {
190                 inData[i + 4] = i + 1;
191             }
192         }
193 
194         WorkgroupSize calculateGlobalDispatchSize() const override {
195             return WorkgroupSize(1, 1, 1);
196         }
197     } step;
198 
199     DispatchGroup::Builder builder(recorder.get());
200     if (!builder.appendStep(&step)) {
201         ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
202         return;
203     }
204 
205     // The output buffer should have been placed in the right output slot.
206     BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
207     if (!outputInfo) {
208         ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
209         return;
210     }
211 
212     // Record the compute task
213     ComputeTask::DispatchGroupList groups;
214     groups.push_back(builder.finalize());
215     recorder->priv().add(ComputeTask::Make(std::move(groups)));
216 
217     // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
218     auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);
219 
220     // Submit the work and wait for it to complete.
221     std::unique_ptr<Recording> recording = recorder->snap();
222     if (!recording) {
223         ERRORF(reporter, "Failed to make recording");
224         return;
225     }
226 
227     InsertRecordingInfo insertInfo;
228     insertInfo.fRecording = recording.get();
229     context->insertRecording(insertInfo);
230     testContext->syncedSubmit(context);
231 
232     // Verify the contents of the output buffer.
233     float* outData = static_cast<float*>(
234             map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
235     SkASSERT(outputBuffer->isMapped() && outData != nullptr);
236     for (unsigned int i = 0; i < kProblemSize; ++i) {
237         const float expected = (i + 1) * kFactor;
238         const float found = outData[i];
239         REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
240     }
241 }
242 
243 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
244 // compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_DispatchGroupTest,reporter,context,testContext)245 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_DispatchGroupTest,
246                                               reporter,
247                                               context,
248                                               testContext) {
249     // TODO(b/315834710): This fails on Dawn D3D11
250     if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
251         return;
252     }
253 
254     constexpr uint32_t kProblemSize = 512;
255     constexpr float kFactor1 = 4.f;
256     constexpr float kFactor2 = 3.f;
257 
258     // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
259     // processes 1 vector at a time.
260     constexpr uint32_t kWorkgroupSize = kProblemSize / 4;
261 
262     std::unique_ptr<Recorder> recorder = context->makeRecorder();
263 
264     // Define two steps that perform two multiplication passes over the same input.
265 
266     class TestComputeStep1 : public ComputeStep {
267     public:
268         // TODO(skia:40045541): SkSL doesn't support std430 layout well, so the buffers
269         // below all pack their data into vectors to be compatible with SPIR-V/WGSL.
270         TestComputeStep1() : ComputeStep(
271                 /*name=*/"TestArrayMultiplyFirstPass",
272                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
273                 /*resources=*/{
274                     // Input buffer:
275                     {
276                         // TODO(b/299979165): Declare this binding as read-only.
277                         /*type=*/ResourceType::kStorageBuffer,
278                         /*flow=*/DataFlow::kPrivate,
279                         /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
280                         /*sksl=*/"inputBlock {\n"
281                             "    float factor;\n"
282                             "    layout(offset=16) float4 in_data[];\n"
283                             "}",
284                     },
285                     // Output buffers:
286                     {
287                         /*type=*/ResourceType::kStorageBuffer,
288                         /*flow=*/DataFlow::kShared,
289                         /*policy=*/ResourcePolicy::kNone,  // GPU-only, read by second step
290                         /*slot=*/0,
291                         /*sksl=*/"outputBlock1 { float4 forward_data[]; }",
292                     },
293                     {
294                         /*type=*/ResourceType::kStorageBuffer,
295                         /*flow=*/DataFlow::kShared,
296                         /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
297                         /*slot=*/1,
298                         /*sksl=*/"outputBlock2 { float2 extra_data; }",
299                     }
300                 }) {}
301         ~TestComputeStep1() override = default;
302 
303         // A kernel that multiplies a large array of floats by a supplied factor.
304         std::string computeSkSL() const override {
305             return R"(
306                 void main() {
307                     uint idx = sk_GlobalInvocationID.x;
308                     forward_data[idx] = in_data[idx] * factor;
309                     if (idx == 0) {
310                         extra_data.x = factor;
311                         extra_data.y = 2 * factor;
312                     }
313                 }
314             )";
315         }
316 
317         size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
318             if (index == 0) {
319                 SkASSERT(r.fFlow == DataFlow::kPrivate);
320                 return sizeof(float) * (kProblemSize + 4);
321             }
322             if (index == 1) {
323                 SkASSERT(r.fFlow == DataFlow::kShared);
324                 SkASSERT(r.fSlot == 0);
325                 return sizeof(float) * kProblemSize;
326             }
327 
328             SkASSERT(index == 2);
329             SkASSERT(r.fSlot == 1);
330             SkASSERT(r.fFlow == DataFlow::kShared);
331             return 2 * sizeof(float);
332         }
333 
334         void prepareStorageBuffer(int resourceIndex,
335                                   const ResourceDesc& r,
336                                   void* buffer,
337                                   size_t bufferSize) const override {
338             if (resourceIndex != 0) {
339                 return;
340             }
341 
342             size_t dataCount = sizeof(float) * (kProblemSize + 4);
343             SkASSERT(bufferSize == dataCount);
344             SkSpan<float> inData(static_cast<float*>(buffer), dataCount);
345             inData[0] = kFactor1;
346             for (unsigned int i = 0; i < kProblemSize; ++i) {
347                 inData[i + 4] = i + 1;
348             }
349         }
350 
351         WorkgroupSize calculateGlobalDispatchSize() const override {
352             return WorkgroupSize(1, 1, 1);
353         }
354     } step1;
355 
356     class TestComputeStep2 : public ComputeStep {
357     public:
358         TestComputeStep2() : ComputeStep(
359                 /*name=*/"TestArrayMultiplySecondPass",
360                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
361                 /*resources=*/{
362                     // Input buffer:
363                     {
364                         /*type=*/ResourceType::kStorageBuffer,
365                         /*flow=*/DataFlow::kShared,
366                         /*policy=*/ResourcePolicy::kNone,  // GPU-only
367                         /*slot=*/0, // this is the output from the first step
368                         /*sksl=*/"inputBlock { float4 in_data[]; }",
369                     },
370                     {
371                         /*type=*/ResourceType::kStorageBuffer,
372                         /*flow=*/DataFlow::kPrivate,
373                         /*policy=*/ResourcePolicy::kMapped,
374                         /*sksl=*/"factorBlock { float factor; }"
375                     },
376                     // Output buffer:
377                     {
378                         /*type=*/ResourceType::kStorageBuffer,
379                         /*flow=*/DataFlow::kShared,
380                         /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
381                         /*slot=*/2,
382                         /*sksl=*/"outputBlock { float4 out_data[]; }",
383                     }
384                 }) {}
385         ~TestComputeStep2() override = default;
386 
387         // A kernel that multiplies a large array of floats by a supplied factor.
388         std::string computeSkSL() const override {
389             return R"(
390                 void main() {
391                     out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
392                 }
393             )";
394         }
395 
396         size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
397             SkASSERT(index != 0);
398             if (index == 1) {
399                 SkASSERT(r.fFlow == DataFlow::kPrivate);
400                 return sizeof(float) * 4;
401             }
402             SkASSERT(index == 2);
403             SkASSERT(r.fSlot == 2);
404             SkASSERT(r.fFlow == DataFlow::kShared);
405             return sizeof(float) * kProblemSize;
406         }
407 
408         void prepareStorageBuffer(int resourceIndex,
409                                   const ResourceDesc& r,
410                                   void* buffer,
411                                   size_t bufferSize) const override {
412             if (resourceIndex != 1) {
413                 return;
414             }
415             SkASSERT(r.fFlow == DataFlow::kPrivate);
416             *static_cast<float*>(buffer) = kFactor2;
417         }
418 
419         WorkgroupSize calculateGlobalDispatchSize() const override {
420             return WorkgroupSize(1, 1, 1);
421         }
422     } step2;
423 
424     DispatchGroup::Builder builder(recorder.get());
425     builder.appendStep(&step1);
426     builder.appendStep(&step2);
427 
428     // Slots 0, 1, and 2 should all contain shared buffers. Slot 1 contains the extra output buffer
429     // from step 1 while slot 2 contains the result of the second multiplication pass from step 1.
430     // Slot 0 is not mappable.
431     REPORTER_ASSERT(reporter,
432                     std::holds_alternative<BindBufferInfo>(builder.outputTable().fSharedSlots[0]),
433                     "shared resource at slot 0 is missing");
434     BindBufferInfo outputInfo = builder.getSharedBufferResource(2);
435     if (!outputInfo) {
436         ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
437         return;
438     }
439 
440     // Extra output buffer from step 1 (corresponding to 'outputBlock2')
441     BindBufferInfo extraOutputInfo = builder.getSharedBufferResource(1);
442     if (!extraOutputInfo) {
443         ERRORF(reporter, "shared resource at slot 1 is missing");
444         return;
445     }
446 
447     // Record the compute task
448     ComputeTask::DispatchGroupList groups;
449     groups.push_back(builder.finalize());
450     recorder->priv().add(ComputeTask::Make(std::move(groups)));
451 
452     // Ensure the output buffers get synchronized to the CPU once the GPU submission has finished.
453     auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);
454     auto extraOutputBuffer = sync_buffer_to_cpu(recorder.get(), extraOutputInfo.fBuffer);
455 
456     // Submit the work and wait for it to complete.
457     std::unique_ptr<Recording> recording = recorder->snap();
458     if (!recording) {
459         ERRORF(reporter, "Failed to make recording");
460         return;
461     }
462 
463     InsertRecordingInfo insertInfo;
464     insertInfo.fRecording = recording.get();
465     context->insertRecording(insertInfo);
466     testContext->syncedSubmit(context);
467 
468     // Verify the contents of the output buffer from step 2
469     float* outData = static_cast<float*>(
470             map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
471     SkASSERT(outputBuffer->isMapped() && outData != nullptr);
472     for (unsigned int i = 0; i < kProblemSize; ++i) {
473         const float expected = (i + 1) * kFactor1 * kFactor2;
474         const float found = outData[i];
475         REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
476     }
477 
478     // Verify the contents of the extra output buffer from step 1
479     float* extraOutData = static_cast<float*>(
480             map_buffer(context, testContext, extraOutputBuffer.get(), extraOutputInfo.fOffset));
481     SkASSERT(extraOutputBuffer->isMapped() && extraOutData != nullptr);
482     REPORTER_ASSERT(reporter,
483                     kFactor1 == extraOutData[0],
484                     "expected '%f', found '%f'",
485                     kFactor1,
486                     extraOutData[0]);
487     REPORTER_ASSERT(reporter,
488                     2 * kFactor1 == extraOutData[1],
489                     "expected '%f', found '%f'",
490                     2 * kFactor2,
491                     extraOutData[1]);
492 }
493 
494 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
495 // compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_UniformBufferTest,reporter,context,testContext)496 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_UniformBufferTest,
497                                               reporter,
498                                               context,
499                                               testContext) {
500     // TODO(b/315834710): This fails on Dawn D3D11
501     if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
502         return;
503     }
504 
505     constexpr uint32_t kProblemSize = 512;
506     constexpr float kFactor = 4.f;
507 
508     // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
509     // processes 1 vector at a time.
510     constexpr uint32_t kWorkgroupSize = kProblemSize / 4;
511 
512     std::unique_ptr<Recorder> recorder = context->makeRecorder();
513 
514     class TestComputeStep : public ComputeStep {
515     public:
516         TestComputeStep() : ComputeStep(
517                 /*name=*/"TestArrayMultiply",
518                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
519                 /*resources=*/{
520                     // Uniform buffer:
521                     {
522                         /*type=*/ResourceType::kUniformBuffer,
523                         /*flow=*/DataFlow::kPrivate,
524                         /*policy=*/ResourcePolicy::kMapped,
525                         /*sksl=*/"uniformBlock { float factor; }"
526                     },
527                     // Input buffer:
528                     {
529                         /*type=*/ResourceType::kStorageBuffer,
530                         /*flow=*/DataFlow::kPrivate,
531                         /*policy=*/ResourcePolicy::kMapped,
532                         /*sksl=*/"inputBlock { float4 in_data[]; }",
533                     },
534                     // Output buffer:
535                     {
536                         /*type=*/ResourceType::kStorageBuffer,
537                         /*flow=*/DataFlow::kShared,  // shared to allow us to access it from the
538                                                      // Builder
539                         /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
540                         /*slot=*/0,
541                         /*sksl=*/"outputBlock { float4 out_data[]; }",
542                     }
543                 }) {}
544         ~TestComputeStep() override = default;
545 
546         // A kernel that multiplies a large array of floats by a supplied factor.
547         std::string computeSkSL() const override {
548             return R"(
549                 void main() {
550                     out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
551                 }
552             )";
553         }
554 
555         size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
556             if (index == 0) {
557                 SkASSERT(r.fFlow == DataFlow::kPrivate);
558                 return sizeof(float);
559             }
560             if (index == 1) {
561                 SkASSERT(r.fFlow == DataFlow::kPrivate);
562                 return sizeof(float) * kProblemSize;
563             }
564             SkASSERT(index == 2);
565             SkASSERT(r.fSlot == 0);
566             SkASSERT(r.fFlow == DataFlow::kShared);
567             return sizeof(float) * kProblemSize;
568         }
569 
570         void prepareStorageBuffer(int resourceIndex,
571                                   const ResourceDesc& r,
572                                   void* buffer,
573                                   size_t bufferSize) const override {
574             // Only initialize the input storage buffer.
575             if (resourceIndex != 1) {
576                 return;
577             }
578             SkASSERT(r.fFlow == DataFlow::kPrivate);
579             size_t dataCount = sizeof(float) * kProblemSize;
580             SkASSERT(bufferSize == dataCount);
581             SkSpan<float> inData(static_cast<float*>(buffer), dataCount);
582             for (unsigned int i = 0; i < kProblemSize; ++i) {
583                 inData[i] = i + 1;
584             }
585         }
586 
587         void prepareUniformBuffer(int resourceIndex,
588                                   const ResourceDesc&,
589                                   UniformManager* mgr) const override {
590             SkASSERT(resourceIndex == 0);
591             SkDEBUGCODE(
592                 const Uniform uniforms[] = {{"factor", SkSLType::kFloat}};
593                 mgr->setExpectedUniforms(uniforms, /*isSubstruct=*/false);
594             )
595             mgr->write(kFactor);
596         }
597 
598         WorkgroupSize calculateGlobalDispatchSize() const override {
599             return WorkgroupSize(1, 1, 1);
600         }
601     } step;
602 
603     DispatchGroup::Builder builder(recorder.get());
604     if (!builder.appendStep(&step)) {
605         ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
606         return;
607     }
608 
609     // The output buffer should have been placed in the right output slot.
610     BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
611     if (!outputInfo) {
612         ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
613         return;
614     }
615 
616     // Record the compute task
617     ComputeTask::DispatchGroupList groups;
618     groups.push_back(builder.finalize());
619     recorder->priv().add(ComputeTask::Make(std::move(groups)));
620 
621     // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
622     auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);
623 
624     // Submit the work and wait for it to complete.
625     std::unique_ptr<Recording> recording = recorder->snap();
626     if (!recording) {
627         ERRORF(reporter, "Failed to make recording");
628         return;
629     }
630 
631     InsertRecordingInfo insertInfo;
632     insertInfo.fRecording = recording.get();
633     context->insertRecording(insertInfo);
634     testContext->syncedSubmit(context);
635 
636     // Verify the contents of the output buffer.
637     float* outData = static_cast<float*>(
638             map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
639     SkASSERT(outputBuffer->isMapped() && outData != nullptr);
640     for (unsigned int i = 0; i < kProblemSize; ++i) {
641         const float expected = (i + 1) * kFactor;
642         const float found = outData[i];
643         REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
644     }
645 }
646 
647 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
648 // compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ExternallyAssignedBuffer,reporter,context,testContext)649 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ExternallyAssignedBuffer,
650                                               reporter,
651                                               context,
652                                               testContext) {
653     constexpr uint32_t kProblemSize = 512;
654     constexpr float kFactor = 4.f;
655 
656     // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
657     // processes 1 vector at a time.
658     constexpr uint32_t kWorkgroupSize = kProblemSize / 4;
659 
660     std::unique_ptr<Recorder> recorder = context->makeRecorder();
661 
662     class TestComputeStep : public ComputeStep {
663     public:
664         TestComputeStep() : ComputeStep(
665                 /*name=*/"ExternallyAssignedBuffer",
666                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
667                 /*resources=*/{
668                     // Input buffer:
669                     {
670                         /*type=*/ResourceType::kStorageBuffer,
671                         /*flow=*/DataFlow::kPrivate,
672                         /*policy=*/ResourcePolicy::kMapped,
673                         /*sksl=*/"inputBlock {\n"
674                                  "    float factor;\n"
675                                  "    layout(offset = 16) float4 in_data[];\n"
676                                  "}\n",
677                     },
678                     // Output buffer:
679                     {
680                         /*type=*/ResourceType::kStorageBuffer,
681                         /*flow=*/DataFlow::kShared,  // shared to allow us to access it from the
682                                                      // Builder
683                         /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
684                         /*slot=*/0,
685                         /*sksl=*/"outputBlock { float4 out_data[]; }",
686                     }
687                 }) {}
688         ~TestComputeStep() override = default;
689 
690         // A kernel that multiplies a large array of floats by a supplied factor.
691         std::string computeSkSL() const override {
692             return R"(
693                 void main() {
694                     out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
695                 }
696             )";
697         }
698 
699         size_t calculateBufferSize(int resourceIndex, const ResourceDesc& r) const override {
700             SkASSERT(resourceIndex == 0);
701             SkASSERT(r.fFlow == DataFlow::kPrivate);
702             return sizeof(float) * (kProblemSize + 4);
703         }
704 
705         void prepareStorageBuffer(int resourceIndex,
706                                   const ResourceDesc& r,
707                                   void* buffer,
708                                   size_t bufferSize) const override {
709             SkASSERT(resourceIndex == 0);
710             SkASSERT(r.fFlow == DataFlow::kPrivate);
711 
712             size_t dataCount = sizeof(float) * (kProblemSize + 4);
713             SkASSERT(bufferSize == dataCount);
714             SkSpan<float> inData(static_cast<float*>(buffer), dataCount);
715             inData[0] = kFactor;
716             for (unsigned int i = 0; i < kProblemSize; ++i) {
717                 inData[i + 4] = i + 1;
718             }
719         }
720     } step;
721 
722     // We allocate a buffer and directly assign it to the DispatchGroup::Builder. The ComputeStep
723     // will not participate in the creation of this buffer.
724     auto [_, outputInfo] =
725             recorder->priv().drawBufferManager()->getStoragePointer(sizeof(float) * kProblemSize);
726     REPORTER_ASSERT(reporter, outputInfo, "Failed to allocate output buffer");
727 
728     DispatchGroup::Builder builder(recorder.get());
729     builder.assignSharedBuffer(outputInfo, 0);
730 
731     // Initialize the step with a pre-determined global size
732     if (!builder.appendStep(&step, {WorkgroupSize(1, 1, 1)})) {
733         ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
734         return;
735     }
736 
737     // Record the compute task
738     ComputeTask::DispatchGroupList groups;
739     groups.push_back(builder.finalize());
740     recorder->priv().add(ComputeTask::Make(std::move(groups)));
741 
742     // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
743     auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);
744 
745     // Submit the work and wait for it to complete.
746     std::unique_ptr<Recording> recording = recorder->snap();
747     if (!recording) {
748         ERRORF(reporter, "Failed to make recording");
749         return;
750     }
751 
752     InsertRecordingInfo insertInfo;
753     insertInfo.fRecording = recording.get();
754     context->insertRecording(insertInfo);
755     testContext->syncedSubmit(context);
756 
757     // Verify the contents of the output buffer.
758     float* outData = static_cast<float*>(
759             map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
760     SkASSERT(outputBuffer->isMapped() && outData != nullptr);
761     for (unsigned int i = 0; i < kProblemSize; ++i) {
762         const float expected = (i + 1) * kFactor;
763         const float found = outData[i];
764         REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
765     }
766 }
767 
768 // Tests the storage texture binding for a compute dispatch that writes the same color to every
769 // pixel of a storage texture.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTexture,reporter,context,testContext)770 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTexture,
771                                               reporter,
772                                               context,
773                                               testContext) {
774     std::unique_ptr<Recorder> recorder = context->makeRecorder();
775 
776     // For this test we allocate a 8x8 tile which is written to by a single workgroup of the same
777     // size.
778     constexpr uint32_t kDim = 8;
779 
780     class TestComputeStep : public ComputeStep {
781     public:
782         TestComputeStep() : ComputeStep(
783                 /*name=*/"TestStorageTexture",
784                 /*localDispatchSize=*/{kDim, kDim, 1},
785                 /*resources=*/{
786                     {
787                         /*type=*/ResourceType::kWriteOnlyStorageTexture,
788                         /*flow=*/DataFlow::kShared,
789                         /*policy=*/ResourcePolicy::kNone,
790                         /*slot=*/0,
791                         /*sksl=*/"dst",
792                     }
793                 }) {}
794         ~TestComputeStep() override = default;
795 
796         std::string computeSkSL() const override {
797             return R"(
798                 void main() {
799                     textureWrite(dst, sk_LocalInvocationID.xy, half4(0.0, 1.0, 0.0, 1.0));
800                 }
801             )";
802         }
803 
804         std::tuple<SkISize, SkColorType> calculateTextureParameters(
805                 int index, const ResourceDesc& r) const override {
806             return {{kDim, kDim}, kRGBA_8888_SkColorType};
807         }
808 
809         WorkgroupSize calculateGlobalDispatchSize() const override {
810             return WorkgroupSize(1, 1, 1);
811         }
812     } step;
813 
814     DispatchGroup::Builder builder(recorder.get());
815     if (!builder.appendStep(&step)) {
816         ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
817         return;
818     }
819 
820     sk_sp<TextureProxy> texture = builder.getSharedTextureResource(0);
821     if (!texture) {
822         ERRORF(reporter, "Shared resource at slot 0 is missing");
823         return;
824     }
825 
826     // Record the compute task
827     ComputeTask::DispatchGroupList groups;
828     groups.push_back(builder.finalize());
829     recorder->priv().add(ComputeTask::Make(std::move(groups)));
830 
831     // Submit the work and wait for it to complete.
832     std::unique_ptr<Recording> recording = recorder->snap();
833     if (!recording) {
834         ERRORF(reporter, "Failed to make recording");
835         return;
836     }
837 
838     InsertRecordingInfo insertInfo;
839     insertInfo.fRecording = recording.get();
840     context->insertRecording(insertInfo);
841     testContext->syncedSubmit(context);
842 
843     SkBitmap bitmap;
844     SkImageInfo imgInfo =
845             SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
846     bitmap.allocPixels(imgInfo);
847 
848     SkPixmap pixels;
849     bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
850     REPORTER_ASSERT(reporter, peekPixelsSuccess);
851 
852     bool readPixelsSuccess = context->priv().readPixels(pixels, texture.get(), imgInfo, 0, 0);
853     REPORTER_ASSERT(reporter, readPixelsSuccess);
854 
855     for (uint32_t x = 0; x < kDim; ++x) {
856         for (uint32_t y = 0; y < kDim; ++y) {
857             SkColor4f expected = SkColor4f::FromColor(SK_ColorGREEN);
858             SkColor4f color = pixels.getColor4f(x, y);
859             REPORTER_ASSERT(reporter, expected == color,
860                             "At position {%u, %u}, "
861                             "expected {%.1f, %.1f, %.1f, %.1f}, "
862                             "found {%.1f, %.1f, %.1f, %.1f}",
863                             x, y,
864                             expected.fR, expected.fG, expected.fB, expected.fA,
865                             color.fR, color.fG, color.fB, color.fA);
866         }
867     }
868 }
869 
870 // Tests the readonly texture binding for a compute dispatch that random-access reads from a
871 // CPU-populated texture and copies it to a storage texture.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTextureReadAndWrite,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // For this test we allocate a 8x8 tile which is written to by a single workgroup of the same
    // size.
    constexpr uint32_t kDim = 8;

    // A ComputeStep that copies its read-only input texture (slot 0, populated externally via
    // assignSharedTexture below) to a write-only storage texture it creates itself (slot 1).
    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestStorageTextureReadAndWrite",
                /*localDispatchSize=*/{kDim, kDim, 1},
                /*resources=*/{
                    // Input: assigned by the DispatchGroup::Builder, not created by this step.
                    {
                        /*type=*/ResourceType::kReadOnlyTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"src",
                    },
                    // Output: instantiated by this step via calculateTextureParameters().
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // Per-texel copy: each invocation reads its own coordinate from `src` and writes the
        // same color to `dst`.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    half4 color = textureRead(src, sk_LocalInvocationID.xy);
                    textureWrite(dst, sk_LocalInvocationID.xy, color);
                }
            )";
        }

        // Only called for the output texture (index 1); the input texture at index 0 is
        // assigned externally and never instantiated by this step.
        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 1);
            return {{kDim, kDim}, kRGBA_8888_SkColorType};
        }

        // A single workgroup covers the whole 8x8 tile.
        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    // Create and populate an input texture.
    SkBitmap srcBitmap;
    SkImageInfo srcInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    srcBitmap.allocPixels(srcInfo);
    SkPixmap srcPixels;
    bool srcPeekPixelsSuccess = srcBitmap.peekPixels(&srcPixels);
    REPORTER_ASSERT(reporter, srcPeekPixelsSuccess);
    // Write an x/y color ramp. Note the raw ARGB-packed word is stored directly into
    // RGBA8888 pixel memory; the expectation loop at the bottom reinterprets the identical
    // word with FromBytes_RGBA, so the comparison is consistent regardless of the swizzle.
    // NOTE(review): this presumably relies on little-endian byte order — the same assumption
    // readPixels/getColor4f make for kRGBA_8888 — confirm if ever run on big-endian.
    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            *srcPixels.writable_addr32(x, y) =
                    SkColorSetARGB(255, x * 256 / kDim, y * 256 / kDim, 0);
        }
    }

    // Create a sampleable GPU texture proxy to hold the input data.
    auto texInfo = context->priv().caps()->getDefaultSampledTextureInfo(kRGBA_8888_SkColorType,
                                                                        skgpu::Mipmapped::kNo,
                                                                        skgpu::Protected::kNo,
                                                                        skgpu::Renderable::kNo);
    sk_sp<TextureProxy> srcProxy = TextureProxy::Make(context->priv().caps(),
                                                      recorder->priv().resourceProvider(),
                                                      {kDim, kDim},
                                                      texInfo,
                                                      "ComputeTestSrcProxy",
                                                      skgpu::Budgeted::kNo);
    // Upload the CPU pixels into the proxy. The upload task is added to the recorder before
    // the compute task, so the texture is populated before the kernel reads it.
    MipLevel mipLevel;
    mipLevel.fPixels = srcPixels.addr();
    mipLevel.fRowBytes = srcPixels.rowBytes();
    UploadInstance upload = UploadInstance::Make(recorder.get(),
                                                 srcProxy,
                                                 srcPixels.info().colorInfo(),
                                                 srcPixels.info().colorInfo(),
                                                 {mipLevel},
                                                 SkIRect::MakeWH(kDim, kDim),
                                                 std::make_unique<ImageUploadContext>());
    if (!upload.isValid()) {
        ERRORF(reporter, "Could not create UploadInstance");
        return;
    }
    recorder->priv().add(UploadTask::Make(std::move(upload)));

    DispatchGroup::Builder builder(recorder.get());

    // Assign the input texture to slot 0. This corresponds to the ComputeStep's "src" texture
    // binding.
    builder.assignSharedTexture(std::move(srcProxy), 0);

    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    // The step created the output texture itself; fetch it from shared slot 1.
    sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
    if (!dst) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Read the output texture back and verify it matches the uploaded ramp.
    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            // Expected value: the same packed word that was stored into the source pixmap,
            // interpreted as RGBA bytes (matching how it sat in RGBA8888 memory).
            SkColor4f expected = SkColor4f::FromBytes_RGBA(
                    SkColorSetARGB(255, x * 256 / kDim, y * 256 / kDim, 0));
            SkColor4f color = pixels.getColor4f(x, y);
            REPORTER_ASSERT(reporter, expected == color,
                            "At position {%u, %u}, "
                            "expected {%.1f, %.1f, %.1f, %.1f}, "
                            "found {%.1f, %.1f, %.1f, %.1f}",
                            x, y,
                            expected.fR, expected.fG, expected.fB, expected.fA,
                            color.fR, color.fG, color.fB, color.fA);
        }
    }
}
1027 
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ReadOnlyStorageBuffer,reporter,context,testContext)1028 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ReadOnlyStorageBuffer,
1029                                               reporter,
1030                                               context,
1031                                               testContext) {
1032     std::unique_ptr<Recorder> recorder = context->makeRecorder();
1033 
1034     // For this test we allocate a 8x8 tile which is written to by a single workgroup of the same
1035     // size.
1036     constexpr uint32_t kDim = 8;
1037 
1038     class TestComputeStep : public ComputeStep {
1039     public:
1040         TestComputeStep() : ComputeStep(
1041                 /*name=*/"TestReadOnlyStorageBuffer",
1042                 /*localDispatchSize=*/{kDim, kDim, 1},
1043                 /*resources=*/{
1044                     {
1045                         /*type=*/ResourceType::kReadOnlyStorageBuffer,
1046                         /*flow=*/DataFlow::kShared,
1047                         /*policy=*/ResourcePolicy::kMapped,
1048                         /*slot=*/0,
1049                         /*sksl=*/"src { uint in_data[]; }",
1050                     },
1051                     {
1052                         /*type=*/ResourceType::kWriteOnlyStorageTexture,
1053                         /*flow=*/DataFlow::kShared,
1054                         /*policy=*/ResourcePolicy::kNone,
1055                         /*slot=*/1,
1056                         /*sksl=*/"dst",
1057                     }
1058                 }) {}
1059         ~TestComputeStep() override = default;
1060 
1061         std::string computeSkSL() const override {
1062             return R"(
1063                 void main() {
1064                     uint ix = sk_LocalInvocationID.y * 8 + sk_LocalInvocationID.x;
1065                     uint value = in_data[ix];
1066                     half4 splat = half4(
1067                         half(value & 0xFF),
1068                         half((value >> 8) & 0xFF),
1069                         half((value >> 16) & 0xFF),
1070                         half((value >> 24) & 0xFF)
1071                     );
1072                     textureWrite(dst, sk_LocalInvocationID.xy, splat / 255.0);
1073                 }
1074             )";
1075         }
1076 
1077         size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
1078             SkASSERT(index == 0);
1079             return kDim * kDim * sizeof(uint32_t);
1080         }
1081 
1082         void prepareStorageBuffer(int index,
1083                                   const ResourceDesc&,
1084                                   void* buffer,
1085                                   size_t bufferSize) const override {
1086             SkASSERT(index == 0);
1087             SkASSERT(bufferSize == kDim * kDim * sizeof(uint32_t));
1088 
1089             uint32_t* inputs = reinterpret_cast<uint32_t*>(buffer);
1090             for (uint32_t y = 0; y < kDim; ++y) {
1091                 for (uint32_t x = 0; x < kDim; ++x) {
1092                     uint32_t value =
1093                             ((x * 256 / kDim) & 0xFF) | ((y * 256 / kDim) & 0xFF) << 8 | 255 << 24;
1094                     *(inputs++) = value;
1095                 }
1096             }
1097         }
1098 
1099         std::tuple<SkISize, SkColorType> calculateTextureParameters(
1100                 int index, const ResourceDesc& r) const override {
1101             SkASSERT(index == 1);
1102             return {{kDim, kDim}, kRGBA_8888_SkColorType};
1103         }
1104 
1105         WorkgroupSize calculateGlobalDispatchSize() const override {
1106             return WorkgroupSize(1, 1, 1);
1107         }
1108     } step;
1109 
1110     DispatchGroup::Builder builder(recorder.get());
1111     if (!builder.appendStep(&step)) {
1112         ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
1113         return;
1114     }
1115 
1116     sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
1117     if (!dst) {
1118         ERRORF(reporter, "shared resource at slot 1 is missing");
1119         return;
1120     }
1121 
1122     // Record the compute task
1123     ComputeTask::DispatchGroupList groups;
1124     groups.push_back(builder.finalize());
1125     recorder->priv().add(ComputeTask::Make(std::move(groups)));
1126 
1127     // Submit the work and wait for it to complete.
1128     std::unique_ptr<Recording> recording = recorder->snap();
1129     if (!recording) {
1130         ERRORF(reporter, "Failed to make recording");
1131         return;
1132     }
1133 
1134     InsertRecordingInfo insertInfo;
1135     insertInfo.fRecording = recording.get();
1136     context->insertRecording(insertInfo);
1137     testContext->syncedSubmit(context);
1138 
1139     SkBitmap bitmap;
1140     SkImageInfo imgInfo =
1141             SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
1142     bitmap.allocPixels(imgInfo);
1143 
1144     SkPixmap pixels;
1145     bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
1146     REPORTER_ASSERT(reporter, peekPixelsSuccess);
1147 
1148     bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
1149     REPORTER_ASSERT(reporter, readPixelsSuccess);
1150 
1151     for (uint32_t x = 0; x < kDim; ++x) {
1152         for (uint32_t y = 0; y < kDim; ++y) {
1153             SkColor4f expected =
1154                     SkColor4f::FromColor(SkColorSetARGB(255, x * 256 / kDim, y * 256 / kDim, 0));
1155             SkColor4f color = pixels.getColor4f(x, y);
1156             bool pass = true;
1157             for (int i = 0; i < 4; i++) {
1158                 pass &= color[i] == expected[i];
1159             }
1160             REPORTER_ASSERT(reporter, pass,
1161                             "At position {%u, %u}, "
1162                             "expected {%.1f, %.1f, %.1f, %.1f}, "
1163                             "found {%.1f, %.1f, %.1f, %.1f}",
1164                             x, y,
1165                             expected.fR, expected.fG, expected.fB, expected.fA,
1166                             color.fR, color.fG, color.fB, color.fA);
1167         }
1168     }
1169 }
1170 
1171 // Tests that a texture written by one compute step can be sampled by a subsequent step.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTextureMultipleComputeSteps,reporter,context,testContext)1172 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTextureMultipleComputeSteps,
1173                                               reporter,
1174                                               context,
1175                                               testContext) {
1176     std::unique_ptr<Recorder> recorder = context->makeRecorder();
1177 
1178     // For this test we allocate a 8x8 tile which is written to by a single workgroup of the same
1179     // size.
1180     constexpr uint32_t kDim = 8;
1181 
1182     // Writes to a texture in slot 0.
1183     class TestComputeStep1 : public ComputeStep {
1184     public:
1185         TestComputeStep1() : ComputeStep(
1186                 /*name=*/"TestStorageTexturesFirstPass",
1187                 /*localDispatchSize=*/{kDim, kDim, 1},
1188                 /*resources=*/{
1189                     {
1190                         /*type=*/ResourceType::kWriteOnlyStorageTexture,
1191                         /*flow=*/DataFlow::kShared,
1192                         /*policy=*/ResourcePolicy::kNone,
1193                         /*slot=*/0,
1194                         /*sksl=*/"dst",
1195                     }
1196                 }) {}
1197         ~TestComputeStep1() override = default;
1198 
1199         std::string computeSkSL() const override {
1200             return R"(
1201                 void main() {
1202                     textureWrite(dst, sk_LocalInvocationID.xy, half4(0.0, 1.0, 0.0, 1.0));
1203                 }
1204             )";
1205         }
1206 
1207         std::tuple<SkISize, SkColorType> calculateTextureParameters(
1208                 int index, const ResourceDesc& r) const override {
1209             SkASSERT(index == 0);
1210             return {{kDim, kDim}, kRGBA_8888_SkColorType};
1211         }
1212 
1213         WorkgroupSize calculateGlobalDispatchSize() const override {
1214             return WorkgroupSize(1, 1, 1);
1215         }
1216     } step1;
1217 
1218     // Reads from the texture in slot 0 and writes it to another texture in slot 1.
1219     class TestComputeStep2 : public ComputeStep {
1220     public:
1221         TestComputeStep2() : ComputeStep(
1222                 /*name=*/"TestStorageTexturesSecondPass",
1223                 /*localDispatchSize=*/{kDim, kDim, 1},
1224                 /*resources=*/{
1225                     {
1226                         /*type=*/ResourceType::kReadOnlyTexture,
1227                         /*flow=*/DataFlow::kShared,
1228                         /*policy=*/ResourcePolicy::kNone,
1229                         /*slot=*/0,
1230                         /*sksl=*/"src",
1231                     },
1232                     {
1233                         /*type=*/ResourceType::kWriteOnlyStorageTexture,
1234                         /*flow=*/DataFlow::kShared,
1235                         /*policy=*/ResourcePolicy::kNone,
1236                         /*slot=*/1,
1237                         /*sksl=*/"dst",
1238                     }
1239                 }) {}
1240         ~TestComputeStep2() override = default;
1241 
1242         std::string computeSkSL() const override {
1243             return R"(
1244                 void main() {
1245                     half4 color = textureRead(src, sk_LocalInvocationID.xy);
1246                     textureWrite(dst, sk_LocalInvocationID.xy, color);
1247                 }
1248             )";
1249         }
1250 
1251         std::tuple<SkISize, SkColorType> calculateTextureParameters(
1252                 int index, const ResourceDesc& r) const override {
1253             SkASSERT(index == 1);
1254             return {{kDim, kDim}, kRGBA_8888_SkColorType};
1255         }
1256 
1257         WorkgroupSize calculateGlobalDispatchSize() const override {
1258             return WorkgroupSize(1, 1, 1);
1259         }
1260     } step2;
1261 
1262     DispatchGroup::Builder builder(recorder.get());
1263     builder.appendStep(&step1);
1264     builder.appendStep(&step2);
1265 
1266     sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
1267     if (!dst) {
1268         ERRORF(reporter, "shared resource at slot 1 is missing");
1269         return;
1270     }
1271 
1272     // Record the compute task
1273     ComputeTask::DispatchGroupList groups;
1274     groups.push_back(builder.finalize());
1275     recorder->priv().add(ComputeTask::Make(std::move(groups)));
1276 
1277     // Submit the work and wait for it to complete.
1278     std::unique_ptr<Recording> recording = recorder->snap();
1279     if (!recording) {
1280         ERRORF(reporter, "Failed to make recording");
1281         return;
1282     }
1283 
1284     InsertRecordingInfo insertInfo;
1285     insertInfo.fRecording = recording.get();
1286     context->insertRecording(insertInfo);
1287     testContext->syncedSubmit(context);
1288 
1289     SkBitmap bitmap;
1290     SkImageInfo imgInfo =
1291             SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
1292     bitmap.allocPixels(imgInfo);
1293 
1294     SkPixmap pixels;
1295     bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
1296     REPORTER_ASSERT(reporter, peekPixelsSuccess);
1297 
1298     bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
1299     REPORTER_ASSERT(reporter, readPixelsSuccess);
1300 
1301     for (uint32_t x = 0; x < kDim; ++x) {
1302         for (uint32_t y = 0; y < kDim; ++y) {
1303             SkColor4f expected = SkColor4f::FromColor(SK_ColorGREEN);
1304             SkColor4f color = pixels.getColor4f(x, y);
1305             REPORTER_ASSERT(reporter, expected == color,
1306                             "At position {%u, %u}, "
1307                             "expected {%.1f, %.1f, %.1f, %.1f}, "
1308                             "found {%.1f, %.1f, %.1f, %.1f}",
1309                             x, y,
1310                             expected.fR, expected.fG, expected.fB, expected.fA,
1311                             color.fR, color.fG, color.fB, color.fA);
1312         }
1313     }
1314 }
1315 
1316 // Tests that a texture can be sampled by a compute step using a sampler.
1317 // TODO(armansito): Once the previous TODO is done, add additional tests that exercise mixed use of
1318 // texture, buffer, and sampler bindings.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_SampledTexture,reporter,context,testContext)1319 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_SampledTexture,
1320                                               reporter,
1321                                               context,
1322                                               testContext) {
1323     std::unique_ptr<Recorder> recorder = context->makeRecorder();
1324 
1325     // The first ComputeStep initializes a 8x8 texture with a checkerboard pattern of alternating
1326     // red and black pixels. The second ComputeStep downsamples this texture into a 4x4 using
1327     // bilinear filtering at pixel borders, intentionally averaging the values of each 4x4 tile in
1328     // the source texture, and writes the result to the destination texture.
1329     constexpr uint32_t kSrcDim = 8;
1330     constexpr uint32_t kDstDim = 4;
1331 
1332     class TestComputeStep1 : public ComputeStep {
1333     public:
1334         TestComputeStep1() : ComputeStep(
1335                 /*name=*/"Test_SampledTexture_Init",
1336                 /*localDispatchSize=*/{kSrcDim, kSrcDim, 1},
1337                 /*resources=*/{
1338                     {
1339                         /*type=*/ResourceType::kWriteOnlyStorageTexture,
1340                         /*flow=*/DataFlow::kShared,
1341                         /*policy=*/ResourcePolicy::kNone,
1342                         /*slot=*/0,
1343                         /*sksl=*/"dst",
1344                     }
1345                 }) {}
1346         ~TestComputeStep1() override = default;
1347 
1348         std::string computeSkSL() const override {
1349             return R"(
1350                 void main() {
1351                     uint2 c = sk_LocalInvocationID.xy;
1352                     uint checkerBoardColor = (c.x + (c.y % 2)) % 2;
1353                     textureWrite(dst, c, half4(checkerBoardColor, 0, 0, 1));
1354                 }
1355             )";
1356         }
1357 
1358         std::tuple<SkISize, SkColorType> calculateTextureParameters(
1359                 int index, const ResourceDesc& r) const override {
1360             SkASSERT(index == 0);
1361             return {{kSrcDim, kSrcDim}, kRGBA_8888_SkColorType};
1362         }
1363 
1364         WorkgroupSize calculateGlobalDispatchSize() const override {
1365             return WorkgroupSize(1, 1, 1);
1366         }
1367     } step1;
1368 
    // Second stage of the sampled-texture test: a single workgroup of
    // kDstDim x kDstDim threads samples the shared `src` texture (slot 0,
    // produced by the previous step) through a linear/repeat sampler and writes
    // one texel per invocation into the shared `dst` storage texture (slot 1).
    class TestComputeStep2 : public ComputeStep {
    public:
        TestComputeStep2() : ComputeStep(
                /*name=*/"Test_SampledTexture_Sample",
                /*localDispatchSize=*/{kDstDim, kDstDim, 1},
                /*resources=*/{
                    // Declare the storage texture before the sampled texture. This tests that
                    // binding index assignment works consistently across all backends when a
                    // sampler-less texture and a texture+sampler pair are intermixed and sampler
                    // bindings aren't necessarily contiguous when the ranges are distinct.
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"dst",
                    },
                    {
                        /*type=*/ResourceType::kSampledTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"src",
                    }
                }) {}
        ~TestComputeStep2() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    // Normalize the 4x4 invocation indices and sample the source texture using
                    // that.
                    uint2 dstCoord = sk_LocalInvocationID.xy;
                    const float2 dstSizeInv = float2(0.25, 0.25);
                    float2 unormCoord = float2(dstCoord) * dstSizeInv;

                    // Use explicit LOD, as quad derivatives are not available to a compute shader.
                    half4 color = sampleLod(src, unormCoord, 0);
                    textureWrite(dst, dstCoord, color);
                }
            )";
        }

        // Size/format of any texture instantiated for this step.
        // NOTE(review): the assert admits both resource indices, though
        // presumably only the storage texture (index 0) is created here since
        // slot 0 is already populated by the producing step — confirm.
        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0 || index == 1);
            return {{kDstDim, kDstDim}, kRGBA_8888_SkColorType};
        }

        // Sampler state for the sampled texture (resource index 1).
        SamplerDesc calculateSamplerParameters(int index, const ResourceDesc&) const override {
            SkASSERT(index == 1);
            // Use the repeat tile mode to sample an infinite checkerboard.
            return {SkFilterMode::kLinear, SkTileMode::kRepeat};
        }

        // One workgroup covers the entire destination texture.
        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step2;
1428 
1429     DispatchGroup::Builder builder(recorder.get());
1430     builder.appendStep(&step1);
1431     builder.appendStep(&step2);
1432 
1433     sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
1434     if (!dst) {
1435         ERRORF(reporter, "shared resource at slot 1 is missing");
1436         return;
1437     }
1438 
1439     // Record the compute task
1440     ComputeTask::DispatchGroupList groups;
1441     groups.push_back(builder.finalize());
1442     recorder->priv().add(ComputeTask::Make(std::move(groups)));
1443 
1444     // Submit the work and wait for it to complete.
1445     std::unique_ptr<Recording> recording = recorder->snap();
1446     if (!recording) {
1447         ERRORF(reporter, "Failed to make recording");
1448         return;
1449     }
1450 
1451     InsertRecordingInfo insertInfo;
1452     insertInfo.fRecording = recording.get();
1453     context->insertRecording(insertInfo);
1454     testContext->syncedSubmit(context);
1455 
1456     SkBitmap bitmap;
1457     SkImageInfo imgInfo =
1458             SkImageInfo::Make(kDstDim, kDstDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
1459     bitmap.allocPixels(imgInfo);
1460 
1461     SkPixmap pixels;
1462     bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
1463     REPORTER_ASSERT(reporter, peekPixelsSuccess);
1464 
1465     bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
1466     REPORTER_ASSERT(reporter, readPixelsSuccess);
1467 
1468     for (uint32_t x = 0; x < kDstDim; ++x) {
1469         for (uint32_t y = 0; y < kDstDim; ++y) {
1470             SkColor4f color = pixels.getColor4f(x, y);
1471             REPORTER_ASSERT(reporter, color.fR > 0.49 && color.fR < 0.51,
1472                             "At position {%u, %u}, "
1473                             "expected red channel in range [0.49, 0.51], "
1474                             "found {%.3f}",
1475                             x, y, color.fR);
1476         }
1477     }
1478 }
1479 
1480 // TODO(b/260622403): The shader tested here is identical to
1481 // `resources/sksl/compute/AtomicsOperations.compute`. It would be nice to be able to exercise SkSL
1482 // features like this as part of SkSLTest.cpp instead of as a graphite test.
1483 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
1484 // compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_AtomicOperationsTest,
                                              reporter,
                                              context,
                                              testContext) {
    // This fails on Dawn D3D11, b/315834710
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 128;

    // A step whose every invocation contributes exactly 1 to a single shared
    // atomic counter; the test then verifies the final tally on the CPU.
    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestAtomicOperations",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                        /*sksl=*/"ssbo { atomicUint globalCounter; }",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // A kernel that increments a global (device memory) counter across multiple workgroups.
        // Each workgroup maintains its own independent tally in a workgroup-shared counter which
        // is then added to the global count.
        //
        // This exercises atomic store/load/add and coherent reads and writes over memory in storage
        // and workgroup address spaces.
        std::string computeSkSL() const override {
            return R"(
                workgroup atomicUint localCounter;

                void main() {
                    // Initialize the local counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicStore(localCounter, 0);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();

                    // All threads increment the counter.
                    atomicAdd(localCounter, 1);

                    // Synchronize the threads again to ensure they have all executed the increment
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();

                    // Add the workgroup-only tally to the global counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicAdd(globalCounter, atomicLoad(localCounter));
                    }
                }
            )";
        }

        // The shared buffer holds exactly one uint32_t: the global counter.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(uint32_t);
        }

        // Dispatch kWorkgroupCount workgroups of kWorkgroupSize threads each.
        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(kWorkgroupCount, 1, 1);
        }

        // Zero-initialize the global counter before the GPU work runs.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            *static_cast<uint32_t*>(buffer) = 0;
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step);

    // Fetch the buffer bound at shared slot 0 so its contents can be read back.
    BindBufferInfo info = builder.getSharedBufferResource(0);
    if (!info) {
        ERRORF(reporter, "shared resource at slot 0 is missing");
        return;
    }

    // Record the compute pass task.
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    // Every invocation added exactly 1, so the counter must equal the total
    // invocation count.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
    const uint32_t result = static_cast<const uint32_t*>(
            map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
    REPORTER_ASSERT(reporter,
                    result == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    result);
}
1609 
1610 // TODO(b/260622403): The shader tested here is identical to
1611 // `resources/sksl/compute/AtomicsOperationsOverArrayAndStruct.compute`. It would be nice to be able
1612 // to exercise SkSL features like this as part of SkSLTest.cpp instead of as a graphite test.
1613 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
1614 // compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_AtomicOperationsOverArrayAndStructTest,
                                              reporter,
                                              context,
                                              testContext) {
    // This fails on Dawn D3D11, b/315834710
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 128;

    // A step whose invocations split evenly between two shared atomic counters
    // (declared as separate SSBO members); the test verifies both tallies.
    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestAtomicOperationsOverArrayAndStruct",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                        /*sksl=*/"ssbo {\n"
                            "   atomicUint globalCountsFirstHalf;\n"
                            "   atomicUint globalCountsSecondHalf;\n"
                            "}\n"
                    }
                }) {}
        ~TestComputeStep() override = default;

        // Construct a kernel that increments two global (device memory) counters across multiple
        // workgroups. Each workgroup maintains its own independent tallies in workgroup-shared
        // counters which are then added to the global counts.
        //
        // This exercises atomic store/load/add and coherent reads and writes over memory in storage
        // and workgroup address spaces.
        std::string computeSkSL() const override {
            return R"(
                const uint WORKGROUP_SIZE = 128;

                workgroup atomicUint localCounts[2];

                void main() {
                    // Initialize the local counts.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicStore(localCounts[0], 0);
                        atomicStore(localCounts[1], 0);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();

                    // Each thread increments one of the local counters based on its invocation
                    // index.
                    uint idx = sk_LocalInvocationID.x < (WORKGROUP_SIZE / 2) ? 0 : 1;
                    atomicAdd(localCounts[idx], 1);

                    // Synchronize the threads again to ensure they have all executed the increments
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();

                    // Add the workgroup-only tally to the global counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicAdd(globalCountsFirstHalf, atomicLoad(localCounts[0]));
                        atomicAdd(globalCountsSecondHalf, atomicLoad(localCounts[1]));
                    }
                }
            )";
        }

        // The shared buffer holds the two uint32_t global counters.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return 2 * sizeof(uint32_t);
        }

        // Dispatch kWorkgroupCount workgroups of kWorkgroupSize threads each.
        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(kWorkgroupCount, 1, 1);
        }

        // Zero-initialize both global counters before the GPU work runs.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            uint32_t* data = static_cast<uint32_t*>(buffer);
            data[0] = 0;
            data[1] = 0;
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step);

    // Fetch the buffer bound at shared slot 0 so its contents can be read back.
    BindBufferInfo info = builder.getSharedBufferResource(0);
    if (!info) {
        ERRORF(reporter, "shared resource at slot 0 is missing");
        return;
    }

    // Record the compute pass task.
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    // Each half of every workgroup incremented its own counter, so each counter
    // ends at half the total invocation count.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize / 2;

    const uint32_t* ssboData = static_cast<const uint32_t*>(
            map_buffer(context, testContext, buffer.get(), info.fOffset));
    const uint32_t firstHalfCount = ssboData[0];
    const uint32_t secondHalfCount = ssboData[1];
    REPORTER_ASSERT(reporter,
                    firstHalfCount == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    firstHalfCount);
    REPORTER_ASSERT(reporter,
                    secondHalfCount == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    secondHalfCount);
}
1758 
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearedBuffer,reporter,context,testContext)1759 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearedBuffer,
1760                                               reporter,
1761                                               context,
1762                                               testContext) {
1763     constexpr uint32_t kProblemSize = 512;
1764 
1765     // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
1766     // processes 1 vector at a time.
1767     constexpr uint32_t kWorkgroupSize = kProblemSize / 4;
1768 
1769     std::unique_ptr<Recorder> recorder = context->makeRecorder();
1770 
1771     // The ComputeStep requests an unmapped buffer that is zero-initialized. It writes the output to
1772     // a mapped buffer which test verifies.
1773     class TestComputeStep : public ComputeStep {
1774     public:
1775         TestComputeStep() : ComputeStep(
1776                 /*name=*/"TestClearedBuffer",
1777                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
1778                 /*resources=*/{
1779                     // Zero initialized input buffer
1780                     {
1781                         // TODO(b/299979165): Declare this binding as read-only.
1782                         /*type=*/ResourceType::kStorageBuffer,
1783                         /*flow=*/DataFlow::kPrivate,
1784                         /*policy=*/ResourcePolicy::kClear,
1785                         /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
1786                     },
1787                     // Output buffer:
1788                     {
1789                         /*type=*/ResourceType::kStorageBuffer,
1790                         /*flow=*/DataFlow::kShared,  // shared to allow us to access it from the
1791                                                      // Builder
1792                         /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
1793                         /*slot=*/0,
1794                         /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
1795                     }
1796                 }) {}
1797         ~TestComputeStep() override = default;
1798 
1799         std::string computeSkSL() const override {
1800             return R"(
1801                 void main() {
1802                     out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
1803                 }
1804             )";
1805         }
1806 
1807         size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
1808             return sizeof(uint32_t) * kProblemSize;
1809         }
1810 
1811         void prepareStorageBuffer(int resourceIndex,
1812                                   const ResourceDesc& r,
1813                                   void* buffer,
1814                                   size_t bufferSize) const override {
1815             // Should receive this call only for the mapped buffer.
1816             SkASSERT(resourceIndex == 1);
1817         }
1818 
1819         WorkgroupSize calculateGlobalDispatchSize() const override {
1820             return WorkgroupSize(1, 1, 1);
1821         }
1822     } step;
1823 
1824     DispatchGroup::Builder builder(recorder.get());
1825     if (!builder.appendStep(&step)) {
1826         ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
1827         return;
1828     }
1829 
1830     // The output buffer should have been placed in the right output slot.
1831     BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
1832     if (!outputInfo) {
1833         ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
1834         return;
1835     }
1836 
1837     // Record the compute task
1838     ComputeTask::DispatchGroupList groups;
1839     groups.push_back(builder.finalize());
1840     recorder->priv().add(ComputeTask::Make(std::move(groups)));
1841 
1842     // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
1843     auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);
1844 
1845     // Submit the work and wait for it to complete.
1846     std::unique_ptr<Recording> recording = recorder->snap();
1847     if (!recording) {
1848         ERRORF(reporter, "Failed to make recording");
1849         return;
1850     }
1851 
1852     InsertRecordingInfo insertInfo;
1853     insertInfo.fRecording = recording.get();
1854     context->insertRecording(insertInfo);
1855     testContext->syncedSubmit(context);
1856 
1857     // Verify the contents of the output buffer.
1858     uint32_t* outData = static_cast<uint32_t*>(
1859             map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
1860     SkASSERT(outputBuffer->isMapped() && outData != nullptr);
1861     for (unsigned int i = 0; i < kProblemSize; ++i) {
1862         const uint32_t found = outData[i];
1863         REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
1864     }
1865 }
1866 
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearOrdering,reporter,context,testContext)1867 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearOrdering,
1868                                               reporter,
1869                                               context,
1870                                               testContext) {
1871     // Initiate two independent DispatchGroups operating on the same buffer. The first group
1872     // writes garbage to the buffer and the second group copies the contents to an output buffer.
1873     // This test validates that the reads, writes, and clear occur in the expected order.
1874     constexpr uint32_t kWorkgroupSize = 64;
1875 
1876     // Initialize buffer with non-zero data.
1877     class FillWithGarbage : public ComputeStep {
1878     public:
1879         FillWithGarbage() : ComputeStep(
1880                 /*name=*/"FillWithGarbage",
1881                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
1882                 /*resources=*/{
1883                     {
1884                         /*type=*/ResourceType::kStorageBuffer,
1885                         /*flow=*/DataFlow::kShared,
1886                         /*policy=*/ResourcePolicy::kNone,
1887                         /*slot=*/0,
1888                         /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
1889                     }
1890                 }) {}
1891         ~FillWithGarbage() override = default;
1892 
1893         std::string computeSkSL() const override {
1894             return R"(
1895                 void main() {
1896                     out_data[sk_GlobalInvocationID.x] = uint4(0xFE);
1897                 }
1898             )";
1899         }
1900     } garbageStep;
1901 
1902     // Second stage just copies the data to a destination buffer. This is only to verify that this
1903     // stage, issued in a separate DispatchGroup, observes the clear.
1904     class CopyBuffer : public ComputeStep {
1905     public:
1906         CopyBuffer() : ComputeStep(
1907                 /*name=*/"CopyBuffer",
1908                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
1909                 /*resources=*/{
1910                     {
1911                         /*type=*/ResourceType::kStorageBuffer,
1912                         /*flow=*/DataFlow::kShared,
1913                         /*policy=*/ResourcePolicy::kNone,
1914                         /*slot=*/0,
1915                         /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
1916                     },
1917                     {
1918                         /*type=*/ResourceType::kStorageBuffer,
1919                         /*flow=*/DataFlow::kShared,
1920                         /*policy=*/ResourcePolicy::kNone,
1921                         /*slot=*/1,
1922                         /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
1923                     }
1924                 }) {}
1925         ~CopyBuffer() override = default;
1926 
1927         std::string computeSkSL() const override {
1928             return R"(
1929                 void main() {
1930                     out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
1931                 }
1932             )";
1933         }
1934     } copyStep;
1935 
1936     std::unique_ptr<Recorder> recorder = context->makeRecorder();
1937     DispatchGroup::Builder builder(recorder.get());
1938 
1939     constexpr size_t kElementCount = 4 * kWorkgroupSize;
1940     constexpr size_t kBufferSize = sizeof(uint32_t) * kElementCount;
1941     auto input = recorder->priv().drawBufferManager()->getStorage(kBufferSize);
1942     auto [_, output] = recorder->priv().drawBufferManager()->getStoragePointer(kBufferSize);
1943 
1944     ComputeTask::DispatchGroupList groups;
1945 
1946     // First group.
1947     builder.assignSharedBuffer(input, 0);
1948     builder.appendStep(&garbageStep, {{1, 1, 1}});
1949     groups.push_back(builder.finalize());
1950 
1951     // Second group.
1952     builder.reset();
1953     builder.assignSharedBuffer(input, 0, ClearBuffer::kYes);
1954     builder.assignSharedBuffer(output, 1);
1955     builder.appendStep(&copyStep, {{1, 1, 1}});
1956     groups.push_back(builder.finalize());
1957 
1958     recorder->priv().add(ComputeTask::Make(std::move(groups)));
1959     // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
1960     auto outputBuffer = sync_buffer_to_cpu(recorder.get(), output.fBuffer);
1961 
1962     // Submit the work and wait for it to complete.
1963     std::unique_ptr<Recording> recording = submit_recording(context, testContext, recorder.get());
1964     if (!recording) {
1965         ERRORF(reporter, "Failed to make recording");
1966         return;
1967     }
1968 
1969     // Verify the contents of the output buffer.
1970     uint32_t* outData = static_cast<uint32_t*>(
1971             map_buffer(context, testContext, outputBuffer.get(), output.fOffset));
1972     SkASSERT(outputBuffer->isMapped() && outData != nullptr);
1973     for (unsigned int i = 0; i < kElementCount; ++i) {
1974         const uint32_t found = outData[i];
1975         REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
1976     }
1977 }
1978 
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearOrderingScratchBuffers,
                                              reporter,
                                              context,
                                              testContext) {
    // This test is the same as the ClearOrdering test but the two stages write to a recycled
    // ScratchBuffer. This is primarily to test ScratchBuffer reuse.
    constexpr uint32_t kWorkgroupSize = 64;

    // Initialize buffer with non-zero data.
    class FillWithGarbage : public ComputeStep {
    public:
        FillWithGarbage() : ComputeStep(
                /*name=*/"FillWithGarbage",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
                    }
                }) {}
        ~FillWithGarbage() override = default;

        // Every invocation writes the 0xFE pattern into one output vector.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = uint4(0xFE);
                }
            )";
        }
    } garbageStep;

    // Second stage just copies the data to a destination buffer. This is only to verify that this
    // stage (issued in a separate DispatchGroup) sees the changes.
    class CopyBuffer : public ComputeStep {
    public:
        CopyBuffer() : ComputeStep(
                /*name=*/"CopyBuffer",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
                    },
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
                    }
                }) {}
        ~CopyBuffer() override = default;

        // Straight element-wise copy from the input buffer to the output.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
                }
            )";
        }
    } copyStep;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();
    DispatchGroup::Builder builder(recorder.get());

    constexpr size_t kElementCount = 4 * kWorkgroupSize;
    constexpr size_t kBufferSize = sizeof(uint32_t) * kElementCount;
    auto [_, output] = recorder->priv().drawBufferManager()->getStoragePointer(kBufferSize);

    ComputeTask::DispatchGroupList groups;

    // First group.
    {
        auto scratch = recorder->priv().drawBufferManager()->getScratchStorage(kBufferSize);
        auto input = scratch.suballocate(kBufferSize);
        builder.assignSharedBuffer(input, 0);

        // `scratch` returns to the scratch buffer pool when it goes out of scope
    }
    builder.appendStep(&garbageStep, {{1, 1, 1}});
    groups.push_back(builder.finalize());

    // Second group.
    builder.reset();
    {
        // NOTE(review): this second scratch allocation presumably recycles the
        // buffer released by the scope above, so the copy observes the memory
        // the garbage step wrote — confirm against DrawBufferManager's
        // scratch-buffer reuse policy.
        auto scratch = recorder->priv().drawBufferManager()->getScratchStorage(kBufferSize);
        auto input = scratch.suballocate(kBufferSize);
        builder.assignSharedBuffer(input, 0, ClearBuffer::kYes);
    }
    builder.assignSharedBuffer(output, 1);
    builder.appendStep(&copyStep, {{1, 1, 1}});
    groups.push_back(builder.finalize());

    recorder->priv().add(ComputeTask::Make(std::move(groups)));
    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), output.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = submit_recording(context, testContext, recorder.get());
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    // Verify the contents of the output buffer.
    // The clear must have executed before the copy, so all elements read zero.
    uint32_t* outData = static_cast<uint32_t*>(
            map_buffer(context, testContext, outputBuffer.get(), output.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kElementCount; ++i) {
        const uint32_t found = outData[i];
        REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
    }
}
2098 
// Tests GPU-driven dispatch: a first step writes indirect dispatch arguments (the global
// workgroup count) into a buffer on the GPU, and a second step is launched via
// DispatchGroup::appendStepIndirect() using those GPU-computed arguments.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_IndirectDispatch,
                                              reporter,
                                              context,
                                              testContext) {
    // This fails on Dawn D3D11, b/315834710
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 64;

    // `IndirectStep` populates a buffer with the global workgroup count for `CountStep`.
    // `CountStep` is recorded using `DispatchGroup::appendStepIndirect()` and its workgroups get
    // dispatched according to the values computed by `IndirectStep` on the GPU.
    class IndirectStep : public ComputeStep {
    public:
        IndirectStep()
                : ComputeStep(
                          /*name=*/"TestIndirectDispatch_IndirectStep",
                          /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                          /*resources=*/
                          {{
                                  /*type=*/ResourceType::kIndirectBuffer,
                                  /*flow=*/DataFlow::kShared,
                                  // kClear: the args buffer is zeroed before the step runs.
                                  /*policy=*/ResourcePolicy::kClear,
                                  // Slot 0 is shared with CountStep's indirect-args binding below.
                                  /*slot=*/0,
                                  // TODO(armansito): Ideally the SSBO would have a single member of
                                  // type `IndirectDispatchArgs` struct type. SkSL modules don't
                                  // support struct declarations so this is currently not possible.
                                  /*sksl=*/"ssbo { uint indirect[]; }",
                          }}) {}
        ~IndirectStep() override = default;

        // Kernel that specifies a workgroup size of `kWorkgroupCount` to be used by the indirect
        // dispatch.
        std::string computeSkSL() const override {
            return R"(
                // This needs to match `kWorkgroupCount` declared above.
                const uint kWorkgroupCount = 32;

                void main() {
                    if (sk_LocalInvocationID.x == 0) {
                        indirect[0] = kWorkgroupCount;
                        indirect[1] = 1;
                        indirect[2] = 1;
                    }
                }
            )";
        }

        // The indirect-args buffer holds exactly one (x, y, z) dispatch-size triple.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return kIndirectDispatchArgumentSize;
        }

        // A single workgroup suffices: only one thread writes the args.
        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } indirectStep;

    // Atomically tallies one increment per invocation into a shared global counter; the final
    // value therefore reveals how many workgroups were actually dispatched indirectly.
    class CountStep : public ComputeStep {
    public:
        CountStep()
                : ComputeStep(
                          /*name=*/"TestIndirectDispatch_CountStep",
                          /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                          /*resources=*/
                          {{
                                  /*type=*/ResourceType::kStorageBuffer,
                                  /*flow=*/DataFlow::kShared,
                                  // kMapped: CPU-initialized via prepareStorageBuffer() below.
                                  /*policy=*/ResourcePolicy::kMapped,
                                  /*slot=*/1,
                                  /*sksl=*/"ssbo { atomicUint globalCounter; }",
                          }}) {}
        ~CountStep() override = default;

        std::string computeSkSL() const override {
            return R"(
                workgroup atomicUint localCounter;

                void main() {
                    // Initialize the local counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicStore(localCounter, 0);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();

                    // All threads increment the counter.
                    atomicAdd(localCounter, 1);

                    // Synchronize the threads again to ensure they have all executed the increment
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();

                    // Add the workgroup-only tally to the global counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicAdd(globalCounter, atomicLoad(localCounter));
                    }
                }
            )";
        }

        // The shared output is a single uint32_t counter.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 1);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(uint32_t);
        }

        // Zero the counter before the GPU runs so the final tally is exact.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            *static_cast<uint32_t*>(buffer) = 0;
        }
    } countStep;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&indirectStep);
    // Slot 0 now holds the GPU-populated indirect dispatch arguments.
    BindBufferInfo indirectBufferInfo = builder.getSharedBufferResource(0);
    if (!indirectBufferInfo) {
        ERRORF(reporter, "Shared resource at slot 0 is missing");
        return;
    }
    REPORTER_ASSERT(reporter, indirectBufferInfo.fSize == kIndirectDispatchArgumentSize);
    builder.appendStepIndirect(&countStep, indirectBufferInfo);

    BindBufferInfo info = builder.getSharedBufferResource(1);
    if (!info) {
        ERRORF(reporter, "Shared resource at slot 1 is missing");
        return;
    }

    // Record the compute pass task.
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer. If the indirect dispatch honored the
    // GPU-written args, exactly kWorkgroupCount * kWorkgroupSize increments occurred.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
    const uint32_t result = static_cast<const uint32_t*>(
            map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
    REPORTER_ASSERT(reporter,
                    result == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    result);
}
2271 
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_NativeShaderSourceMetal,reporter,context,testContext)2272 DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_NativeShaderSourceMetal,
2273                                     reporter,
2274                                     context,
2275                                     testContext) {
2276     std::unique_ptr<Recorder> recorder = context->makeRecorder();
2277 
2278     constexpr uint32_t kWorkgroupCount = 32;
2279     constexpr uint32_t kWorkgroupSize = 1024;
2280 
2281     class TestComputeStep : public ComputeStep {
2282     public:
2283         TestComputeStep() : ComputeStep(
2284                 /*name=*/"TestAtomicOperationsMetal",
2285                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
2286                 /*resources=*/{
2287                     {
2288                         /*type=*/ResourceType::kStorageBuffer,
2289                         /*flow=*/DataFlow::kShared,
2290                         /*policy=*/ResourcePolicy::kMapped,
2291                         /*slot=*/0,
2292                     }
2293                 },
2294                 /*workgroupBuffers=*/{},
2295                 /*baseFlags=*/Flags::kSupportsNativeShader) {}
2296         ~TestComputeStep() override = default;
2297 
2298         NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
2299             SkASSERT(format == NativeShaderFormat::kMSL);
2300             static constexpr std::string_view kSource = R"(
2301                 #include <metal_stdlib>
2302 
2303                 using namespace metal;
2304 
2305                 kernel void atomicCount(uint3 localId [[thread_position_in_threadgroup]],
2306                                         device atomic_uint& globalCounter [[buffer(0)]]) {
2307                     threadgroup atomic_uint localCounter;
2308 
2309                     // Initialize the local counter.
2310                     if (localId.x == 0u) {
2311                         atomic_store_explicit(&localCounter, 0u, memory_order_relaxed);
2312                     }
2313 
2314                     // Synchronize the threads in the workgroup so they all see the initial value.
2315                     threadgroup_barrier(mem_flags::mem_threadgroup);
2316 
2317                     // All threads increment the counter.
2318                     atomic_fetch_add_explicit(&localCounter, 1u, memory_order_relaxed);
2319 
2320                     // Synchronize the threads again to ensure they have all executed the increment
2321                     // and the following load reads the same value across all threads in the
2322                     // workgroup.
2323                     threadgroup_barrier(mem_flags::mem_threadgroup);
2324 
2325                     // Add the workgroup-only tally to the global counter.
2326                     if (localId.x == 0u) {
2327                         uint tally = atomic_load_explicit(&localCounter, memory_order_relaxed);
2328                         atomic_fetch_add_explicit(&globalCounter, tally, memory_order_relaxed);
2329                     }
2330                 }
2331             )";
2332             return {kSource, "atomicCount"};
2333         }
2334 
2335         size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
2336             SkASSERT(index == 0);
2337             SkASSERT(r.fSlot == 0);
2338             SkASSERT(r.fFlow == DataFlow::kShared);
2339             return sizeof(uint32_t);
2340         }
2341 
2342         WorkgroupSize calculateGlobalDispatchSize() const override {
2343             return WorkgroupSize(kWorkgroupCount, 1, 1);
2344         }
2345 
2346         void prepareStorageBuffer(int resourceIndex,
2347                                   const ResourceDesc& r,
2348                                   void* buffer,
2349                                   size_t bufferSize) const override {
2350             SkASSERT(resourceIndex == 0);
2351             *static_cast<uint32_t*>(buffer) = 0;
2352         }
2353     } step;
2354 
2355     DispatchGroup::Builder builder(recorder.get());
2356     builder.appendStep(&step);
2357 
2358     BindBufferInfo info = builder.getSharedBufferResource(0);
2359     if (!info) {
2360         ERRORF(reporter, "shared resource at slot 0 is missing");
2361         return;
2362     }
2363 
2364     // Record the compute pass task.
2365     ComputeTask::DispatchGroupList groups;
2366     groups.push_back(builder.finalize());
2367     recorder->priv().add(ComputeTask::Make(std::move(groups)));
2368 
2369     // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
2370     auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);
2371 
2372     // Submit the work and wait for it to complete.
2373     std::unique_ptr<Recording> recording = recorder->snap();
2374     if (!recording) {
2375         ERRORF(reporter, "Failed to make recording");
2376         return;
2377     }
2378 
2379     InsertRecordingInfo insertInfo;
2380     insertInfo.fRecording = recording.get();
2381     context->insertRecording(insertInfo);
2382     testContext->syncedSubmit(context);
2383 
2384     // Verify the contents of the output buffer.
2385     constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
2386     const uint32_t result = static_cast<const uint32_t*>(
2387             map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
2388     REPORTER_ASSERT(reporter,
2389                     result == kExpectedCount,
2390                     "expected '%u', found '%u'",
2391                     kExpectedCount,
2392                     result);
2393 }
2394 
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_WorkgroupBufferDescMetal,reporter,context,testContext)2395 DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_WorkgroupBufferDescMetal,
2396                                     reporter,
2397                                     context,
2398                                     testContext) {
2399     std::unique_ptr<Recorder> recorder = context->makeRecorder();
2400 
2401     constexpr uint32_t kWorkgroupCount = 32;
2402     constexpr uint32_t kWorkgroupSize = 1024;
2403 
2404     class TestComputeStep : public ComputeStep {
2405     public:
2406         TestComputeStep() : ComputeStep(
2407                 /*name=*/"TestAtomicOperationsMetal",
2408                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
2409                 /*resources=*/{
2410                     {
2411                         /*type=*/ResourceType::kStorageBuffer,
2412                         /*flow=*/DataFlow::kShared,
2413                         /*policy=*/ResourcePolicy::kMapped,
2414                         /*slot=*/0,
2415                     }
2416                 },
2417                 /*workgroupBuffers=*/{
2418                     {
2419                         /*size=*/sizeof(uint32_t),
2420                         /*index=*/0u,
2421                     }
2422                 },
2423                 /*baseFlags=*/Flags::kSupportsNativeShader) {}
2424         ~TestComputeStep() override = default;
2425 
2426         // This is the same MSL kernel as in Compute_NativeShaderSourceMetal, except `localCounter`
2427         // is an entry-point parameter instead of a local variable. This forces the workgroup
2428         // binding to be encoded explicitly in the command encoder.
2429         NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
2430             SkASSERT(format == NativeShaderFormat::kMSL);
2431             static constexpr std::string_view kSource = R"(
2432                 #include <metal_stdlib>
2433 
2434                 using namespace metal;
2435 
2436                 kernel void atomicCount(uint3 localId [[thread_position_in_threadgroup]],
2437                                         device atomic_uint& globalCounter [[buffer(0)]],
2438                                         threadgroup atomic_uint& localCounter [[threadgroup(0)]]) {
2439                     // Initialize the local counter.
2440                     if (localId.x == 0u) {
2441                         atomic_store_explicit(&localCounter, 0u, memory_order_relaxed);
2442                     }
2443 
2444                     // Synchronize the threads in the workgroup so they all see the initial value.
2445                     threadgroup_barrier(mem_flags::mem_threadgroup);
2446 
2447                     // All threads increment the counter.
2448                     atomic_fetch_add_explicit(&localCounter, 1u, memory_order_relaxed);
2449 
2450                     // Synchronize the threads again to ensure they have all executed the increment
2451                     // and the following load reads the same value across all threads in the
2452                     // workgroup.
2453                     threadgroup_barrier(mem_flags::mem_threadgroup);
2454 
2455                     // Add the workgroup-only tally to the global counter.
2456                     if (localId.x == 0u) {
2457                         uint tally = atomic_load_explicit(&localCounter, memory_order_relaxed);
2458                         atomic_fetch_add_explicit(&globalCounter, tally, memory_order_relaxed);
2459                     }
2460                 }
2461             )";
2462             return {kSource, "atomicCount"};
2463         }
2464 
2465         size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
2466             SkASSERT(index == 0);
2467             SkASSERT(r.fSlot == 0);
2468             SkASSERT(r.fFlow == DataFlow::kShared);
2469             return sizeof(uint32_t);
2470         }
2471 
2472         WorkgroupSize calculateGlobalDispatchSize() const override {
2473             return WorkgroupSize(kWorkgroupCount, 1, 1);
2474         }
2475 
2476         void prepareStorageBuffer(int resourceIndex,
2477                                   const ResourceDesc& r,
2478                                   void* buffer,
2479                                   size_t bufferSize) const override {
2480             SkASSERT(resourceIndex == 0);
2481             *static_cast<uint32_t*>(buffer) = 0;
2482         }
2483     } step;
2484 
2485     DispatchGroup::Builder builder(recorder.get());
2486     builder.appendStep(&step);
2487 
2488     BindBufferInfo info = builder.getSharedBufferResource(0);
2489     if (!info) {
2490         ERRORF(reporter, "shared resource at slot 0 is missing");
2491         return;
2492     }
2493 
2494     // Record the compute pass task.
2495     ComputeTask::DispatchGroupList groups;
2496     groups.push_back(builder.finalize());
2497     recorder->priv().add(ComputeTask::Make(std::move(groups)));
2498 
2499     // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
2500     auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);
2501 
2502     // Submit the work and wait for it to complete.
2503     std::unique_ptr<Recording> recording = recorder->snap();
2504     if (!recording) {
2505         ERRORF(reporter, "Failed to make recording");
2506         return;
2507     }
2508 
2509     InsertRecordingInfo insertInfo;
2510     insertInfo.fRecording = recording.get();
2511     context->insertRecording(insertInfo);
2512     testContext->syncedSubmit(context);
2513 
2514     // Verify the contents of the output buffer.
2515     constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
2516     const uint32_t result = static_cast<const uint32_t*>(
2517             map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
2518     REPORTER_ASSERT(reporter,
2519                     result == kExpectedCount,
2520                     "expected '%u', found '%u'",
2521                     kExpectedCount,
2522                     result);
2523 }
2524 
// WGSL analogue of Compute_NativeShaderSourceMetal: the ComputeStep supplies its kernel
// directly as WGSL via Flags::kSupportsNativeShader, and every invocation contributes one
// increment to a shared counter that is verified on the CPU.
DEF_GRAPHITE_TEST_FOR_DAWN_CONTEXT(Compute_NativeShaderSourceWGSL, reporter, context, testContext) {
    // This fails on Dawn D3D11, b/315834710
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    constexpr uint32_t kWorkgroupCount = 32;
    // The WebGPU compat default workgroup size limit is 128.
    constexpr uint32_t kWorkgroupSize = 128;

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestAtomicOperationsWGSL",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        // kMapped: CPU-initialized via prepareStorageBuffer() below.
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                    }
                },
                /*workgroupBuffers=*/{},
                /*baseFlags=*/Flags::kSupportsNativeShader) {}
        ~TestComputeStep() override = default;

        // Raw WGSL kernel; the entry point is `atomicCount`. NOTE: the literal
        // `@workgroup_size(128)` below must stay in sync with `kWorkgroupSize` above.
        NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
            SkASSERT(format == NativeShaderFormat::kWGSL);
            static constexpr std::string_view kSource = R"(
                @group(0) @binding(0) var<storage, read_write> globalCounter: atomic<u32>;

                var<workgroup> localCounter: atomic<u32>;

                @compute @workgroup_size(128)
                fn atomicCount(@builtin(local_invocation_id) localId: vec3u) {
                    // Initialize the local counter.
                    if localId.x == 0u {
                        atomicStore(&localCounter, 0u);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();

                    // All threads increment the counter.
                    atomicAdd(&localCounter, 1u);

                    // Synchronize the threads again to ensure they have all executed the increment
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();

                    // Add the workgroup-only tally to the global counter.
                    if localId.x == 0u {
                        let tally = atomicLoad(&localCounter);
                        atomicAdd(&globalCounter, tally);
                    }
                }
            )";
            return {kSource, "atomicCount"};
        }

        // The shared output is a single uint32_t counter.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(uint32_t);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(kWorkgroupCount, 1, 1);
        }

        // Zero the counter before the GPU runs so the final tally is exact.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            *static_cast<uint32_t*>(buffer) = 0;
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step);

    BindBufferInfo info = builder.getSharedBufferResource(0);
    if (!info) {
        ERRORF(reporter, "shared resource at slot 0 is missing");
        return;
    }

    // Record the compute pass task.
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer: one increment per invocation.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
    const uint32_t result = static_cast<const uint32_t*>(
            map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
    REPORTER_ASSERT(reporter,
                    result == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    result);
}
2648