1 /*
2 * Copyright 2022 Google LLC
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "tests/Test.h"
9
10 #include "include/core/SkBitmap.h"
11 #include "include/gpu/graphite/Context.h"
12 #include "include/gpu/graphite/Recorder.h"
13 #include "include/gpu/graphite/Recording.h"
14 #include "src/gpu/graphite/Buffer.h"
15 #include "src/gpu/graphite/Caps.h"
16 #include "src/gpu/graphite/ComputePipelineDesc.h"
17 #include "src/gpu/graphite/ComputeTypes.h"
18 #include "src/gpu/graphite/ContextPriv.h"
19 #include "src/gpu/graphite/RecorderPriv.h"
20 #include "src/gpu/graphite/ResourceProvider.h"
21 #include "src/gpu/graphite/UniformManager.h"
22 #include "src/gpu/graphite/compute/ComputeStep.h"
23 #include "src/gpu/graphite/compute/DispatchGroup.h"
24 #include "src/gpu/graphite/task/ComputeTask.h"
25 #include "src/gpu/graphite/task/CopyTask.h"
26 #include "src/gpu/graphite/task/SynchronizeToCpuTask.h"
27 #include "src/gpu/graphite/task/UploadTask.h"
28
29 #include "tools/graphite/GraphiteTestContext.h"
30
31 using namespace skgpu::graphite;
32 using namespace skiatest::graphite;
33
34 namespace {
35
map_buffer(Context * context,skiatest::graphite::GraphiteTestContext * testContext,Buffer * buffer,size_t offset)36 void* map_buffer(Context* context,
37 skiatest::graphite::GraphiteTestContext* testContext,
38 Buffer* buffer,
39 size_t offset) {
40 SkASSERT(buffer);
41 if (context->priv().caps()->bufferMapsAreAsync()) {
42 buffer->asyncMap();
43 while (!buffer->isMapped()) {
44 testContext->tick();
45 }
46 }
47 std::byte* ptr = static_cast<std::byte*>(buffer->map());
48 SkASSERT(ptr);
49
50 return ptr + offset;
51 }
52
sync_buffer_to_cpu(Recorder * recorder,const Buffer * buffer)53 sk_sp<Buffer> sync_buffer_to_cpu(Recorder* recorder, const Buffer* buffer) {
54 if (recorder->priv().caps()->drawBufferCanBeMappedForReadback()) {
55 // `buffer` can be mapped directly, however it may still require a synchronization step
56 // by the underlying API (e.g. a managed buffer in Metal). SynchronizeToCpuTask
57 // automatically handles this for us.
58 recorder->priv().add(SynchronizeToCpuTask::Make(sk_ref_sp(buffer)));
59 return sk_ref_sp(buffer);
60 }
61
62 // The backend requires a transfer buffer for CPU read-back
63 auto xferBuffer =
64 recorder->priv().resourceProvider()->findOrCreateBuffer(buffer->size(),
65 BufferType::kXferGpuToCpu,
66 AccessPattern::kHostVisible,
67 "ComputeTest_TransferToCpu");
68 SkASSERT(xferBuffer);
69
70 recorder->priv().add(CopyBufferToBufferTask::Make(buffer,
71 /*srcOffset=*/0,
72 xferBuffer,
73 /*dstOffset=*/0,
74 buffer->size()));
75 return xferBuffer;
76 }
77
submit_recording(Context * context,GraphiteTestContext * testContext,Recorder * recorder)78 std::unique_ptr<Recording> submit_recording(Context* context,
79 GraphiteTestContext* testContext,
80 Recorder* recorder) {
81 std::unique_ptr<Recording> recording = recorder->snap();
82 if (!recording) {
83 return nullptr;
84 }
85
86 InsertRecordingInfo insertInfo;
87 insertInfo.fRecording = recording.get();
88 context->insertRecording(insertInfo);
89 testContext->syncedSubmit(context);
90
91 return recording;
92 }
93
is_dawn_or_metal_context_type(skiatest::GpuContextType ctxType)94 bool is_dawn_or_metal_context_type(skiatest::GpuContextType ctxType) {
95 return skiatest::IsDawnContextType(ctxType) || skiatest::IsMetalContextType(ctxType);
96 }
97
98 } // namespace
99
// Registers a Graphite test restricted to Dawn and Metal contexts (via the
// is_dawn_or_metal_context_type filter above), since these are currently the only
// backends with compute-program support. CTS enforcement is disabled.
#define DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS( \
        name, reporter, graphite_context, test_context) \
    DEF_GRAPHITE_TEST_FOR_CONTEXTS(name, \
                                   is_dawn_or_metal_context_type, \
                                   reporter, \
                                   graphite_context, \
                                   test_context, \
                                   CtsEnforcement::kNever)
108
109 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
110 // compute programs.
// Tests a single compute dispatch: a kernel multiplies an array of floats (packed
// into float4s) by a factor stored in the same input buffer, and the result is read
// back and verified on the CPU.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_SingleDispatchTest,
                                              reporter,
                                              context,
                                              testContext) {
    constexpr uint32_t kProblemSize = 512;
    constexpr float kFactor = 4.f;

    // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
    // processes 1 vector at a time.
    constexpr uint32_t kWorkgroupSize = kProblemSize / 4;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    class TestComputeStep : public ComputeStep {
    public:
        // TODO(skia:40045541): SkSL doesn't support std430 layout well, so the buffers
        // below all pack their data into vectors to be compatible with SPIR-V/WGSL.
        TestComputeStep() : ComputeStep(
                /*name=*/"TestArrayMultiply",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Input buffer:
                    {
                        // TODO(b/299979165): Declare this binding as read-only.
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*sksl=*/"inputBlock {\n"
                                 "    float factor;\n"
                                 "    layout(offset=16) float4 in_data[];\n"
                                 "}",
                    },
                    // Output buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,  // shared to allow us to access it from the
                                                     // Builder
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*slot=*/0,
                        /*sksl=*/"outputBlock { float4 out_data[]; }",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // A kernel that multiplies a large array of floats by a supplied factor.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
                }
            )";
        }

        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            if (index == 0) {
                SkASSERT(r.fFlow == DataFlow::kPrivate);
                // One float for `factor`, 3 floats of padding (due to layout(offset=16)),
                // then kProblemSize floats of data.
                return sizeof(float) * (kProblemSize + 4);
            }
            SkASSERT(index == 1);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(float) * kProblemSize;
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            // Only initialize the input buffer.
            if (resourceIndex != 0) {
                return;
            }
            SkASSERT(r.fFlow == DataFlow::kPrivate);

            // NOTE: SkSpan's size is an element count. The span was previously constructed
            // with the byte count, which overstated the addressable range by 4x.
            size_t dataCount = kProblemSize + 4;
            SkASSERT(bufferSize == dataCount * sizeof(float));
            SkSpan<float> inData(static_cast<float*>(buffer), dataCount);
            inData[0] = kFactor;
            for (unsigned int i = 0; i < kProblemSize; ++i) {
                inData[i + 4] = i + 1;
            }
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            // A single workgroup covers the whole problem.
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    // The output buffer should have been placed in the right output slot.
    BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
    if (!outputInfo) {
        ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    float* outData = static_cast<float*>(
            map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kProblemSize; ++i) {
        const float expected = (i + 1) * kFactor;
        const float found = outData[i];
        REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
    }
}
242
243 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
244 // compute programs.
// Tests chaining two ComputeSteps in one DispatchGroup: step 1 writes its result to
// shared slot 0 (GPU-only) which step 2 consumes, and also writes an extra float2 to
// shared slot 1. Verifies both the final product (slot 2) and the extra output.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_DispatchGroupTest,
                                              reporter,
                                              context,
                                              testContext) {
    // TODO(b/315834710): This fails on Dawn D3D11
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    constexpr uint32_t kProblemSize = 512;
    constexpr float kFactor1 = 4.f;
    constexpr float kFactor2 = 3.f;

    // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
    // processes 1 vector at a time.
    constexpr uint32_t kWorkgroupSize = kProblemSize / 4;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // Define two steps that perform two multiplication passes over the same input.

    class TestComputeStep1 : public ComputeStep {
    public:
        // TODO(skia:40045541): SkSL doesn't support std430 layout well, so the buffers
        // below all pack their data into vectors to be compatible with SPIR-V/WGSL.
        TestComputeStep1() : ComputeStep(
                /*name=*/"TestArrayMultiplyFirstPass",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Input buffer:
                    {
                        // TODO(b/299979165): Declare this binding as read-only.
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*sksl=*/"inputBlock {\n"
                                 "    float factor;\n"
                                 "    layout(offset=16) float4 in_data[];\n"
                                 "}",
                    },
                    // Output buffers:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,  // GPU-only, read by second step
                        /*slot=*/0,
                        /*sksl=*/"outputBlock1 { float4 forward_data[]; }",
                    },
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*slot=*/1,
                        /*sksl=*/"outputBlock2 { float2 extra_data; }",
                    }
                }) {}
        ~TestComputeStep1() override = default;

        // A kernel that multiplies a large array of floats by a supplied factor.
        // Thread 0 additionally records (factor, 2*factor) in the extra output.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    uint idx = sk_GlobalInvocationID.x;
                    forward_data[idx] = in_data[idx] * factor;
                    if (idx == 0) {
                        extra_data.x = factor;
                        extra_data.y = 2 * factor;
                    }
                }
            )";
        }

        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            if (index == 0) {
                SkASSERT(r.fFlow == DataFlow::kPrivate);
                // One float factor + 3 floats padding (layout(offset=16)) + the data.
                return sizeof(float) * (kProblemSize + 4);
            }
            if (index == 1) {
                SkASSERT(r.fFlow == DataFlow::kShared);
                SkASSERT(r.fSlot == 0);
                return sizeof(float) * kProblemSize;
            }

            SkASSERT(index == 2);
            SkASSERT(r.fSlot == 1);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return 2 * sizeof(float);
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            // Only the private input buffer is CPU-initialized.
            if (resourceIndex != 0) {
                return;
            }

            // NOTE: SkSpan's size is an element count, not a byte count.
            size_t dataCount = kProblemSize + 4;
            SkASSERT(bufferSize == dataCount * sizeof(float));
            SkSpan<float> inData(static_cast<float*>(buffer), dataCount);
            inData[0] = kFactor1;
            for (unsigned int i = 0; i < kProblemSize; ++i) {
                inData[i + 4] = i + 1;
            }
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step1;

    class TestComputeStep2 : public ComputeStep {
    public:
        TestComputeStep2() : ComputeStep(
                /*name=*/"TestArrayMultiplySecondPass",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Input buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,  // GPU-only
                        /*slot=*/0,  // this is the output from the first step
                        /*sksl=*/"inputBlock { float4 in_data[]; }",
                    },
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*sksl=*/"factorBlock { float factor; }"
                    },
                    // Output buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*slot=*/2,
                        /*sksl=*/"outputBlock { float4 out_data[]; }",
                    }
                }) {}
        ~TestComputeStep2() override = default;

        // A kernel that multiplies a large array of floats by a supplied factor.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
                }
            )";
        }

        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            // Index 0 (slot 0) is allocated by step 1; we are never asked to size it.
            SkASSERT(index != 0);
            if (index == 1) {
                SkASSERT(r.fFlow == DataFlow::kPrivate);
                return sizeof(float) * 4;
            }
            SkASSERT(index == 2);
            SkASSERT(r.fSlot == 2);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(float) * kProblemSize;
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            // Only the private factor buffer is CPU-initialized.
            if (resourceIndex != 1) {
                return;
            }
            SkASSERT(r.fFlow == DataFlow::kPrivate);
            *static_cast<float*>(buffer) = kFactor2;
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step2;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step1);
    builder.appendStep(&step2);

    // Slots 0, 1, and 2 should all contain shared buffers. Slot 1 contains the extra output buffer
    // from step 1 while slot 2 contains the result of the second multiplication pass from step 1.
    // Slot 0 is not mappable.
    REPORTER_ASSERT(reporter,
                    std::holds_alternative<BindBufferInfo>(builder.outputTable().fSharedSlots[0]),
                    "shared resource at slot 0 is missing");
    BindBufferInfo outputInfo = builder.getSharedBufferResource(2);
    if (!outputInfo) {
        // Bug fix: this message previously referred to slot 0 although slot 2 was queried.
        ERRORF(reporter, "Failed to allocate an output buffer at slot 2");
        return;
    }

    // Extra output buffer from step 1 (corresponding to 'outputBlock2')
    BindBufferInfo extraOutputInfo = builder.getSharedBufferResource(1);
    if (!extraOutputInfo) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffers get synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);
    auto extraOutputBuffer = sync_buffer_to_cpu(recorder.get(), extraOutputInfo.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer from step 2
    float* outData = static_cast<float*>(
            map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kProblemSize; ++i) {
        const float expected = (i + 1) * kFactor1 * kFactor2;
        const float found = outData[i];
        REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
    }

    // Verify the contents of the extra output buffer from step 1
    float* extraOutData = static_cast<float*>(
            map_buffer(context, testContext, extraOutputBuffer.get(), extraOutputInfo.fOffset));
    SkASSERT(extraOutputBuffer->isMapped() && extraOutData != nullptr);
    REPORTER_ASSERT(reporter,
                    kFactor1 == extraOutData[0],
                    "expected '%f', found '%f'",
                    kFactor1,
                    extraOutData[0]);
    REPORTER_ASSERT(reporter,
                    2 * kFactor1 == extraOutData[1],
                    "expected '%f', found '%f'",
                    // Bug fix: previously printed 2 * kFactor2 as the expected value even
                    // though the assertion compares against 2 * kFactor1.
                    2 * kFactor1,
                    extraOutData[1]);
}
493
494 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
495 // compute programs.
// Tests that a compute dispatch can read from a uniform buffer populated via the
// UniformManager (prepareUniformBuffer) alongside a storage-buffer input.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_UniformBufferTest,
                                              reporter,
                                              context,
                                              testContext) {
    // TODO(b/315834710): This fails on Dawn D3D11
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    constexpr uint32_t kProblemSize = 512;
    constexpr float kFactor = 4.f;

    // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
    // processes 1 vector at a time.
    constexpr uint32_t kWorkgroupSize = kProblemSize / 4;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestArrayMultiply",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Uniform buffer:
                    {
                        /*type=*/ResourceType::kUniformBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*sksl=*/"uniformBlock { float factor; }"
                    },
                    // Input buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*sksl=*/"inputBlock { float4 in_data[]; }",
                    },
                    // Output buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,  // shared to allow us to access it from the
                                                     // Builder
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*slot=*/0,
                        /*sksl=*/"outputBlock { float4 out_data[]; }",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // A kernel that multiplies a large array of floats by a supplied factor.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
                }
            )";
        }

        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            if (index == 0) {
                SkASSERT(r.fFlow == DataFlow::kPrivate);
                return sizeof(float);
            }
            if (index == 1) {
                SkASSERT(r.fFlow == DataFlow::kPrivate);
                return sizeof(float) * kProblemSize;
            }
            SkASSERT(index == 2);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(float) * kProblemSize;
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            // Only initialize the input storage buffer.
            if (resourceIndex != 1) {
                return;
            }
            SkASSERT(r.fFlow == DataFlow::kPrivate);
            // NOTE: SkSpan's size is an element count. The span was previously
            // constructed with the byte count.
            size_t dataCount = kProblemSize;
            SkASSERT(bufferSize == dataCount * sizeof(float));
            SkSpan<float> inData(static_cast<float*>(buffer), dataCount);
            for (unsigned int i = 0; i < kProblemSize; ++i) {
                inData[i] = i + 1;
            }
        }

        void prepareUniformBuffer(int resourceIndex,
                                  const ResourceDesc&,
                                  UniformManager* mgr) const override {
            SkASSERT(resourceIndex == 0);
            // In debug builds, declare the expected uniform layout so the manager can
            // validate the writes below.
            SkDEBUGCODE(
                const Uniform uniforms[] = {{"factor", SkSLType::kFloat}};
                mgr->setExpectedUniforms(uniforms, /*isSubstruct=*/false);
            )
            mgr->write(kFactor);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    // The output buffer should have been placed in the right output slot.
    BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
    if (!outputInfo) {
        ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    float* outData = static_cast<float*>(
            map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kProblemSize; ++i) {
        const float expected = (i + 1) * kFactor;
        const float found = outData[i];
        REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
    }
}
646
647 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
648 // compute programs.
// Tests that the DispatchGroup::Builder accepts an externally allocated buffer
// (assigned via assignSharedBuffer) for a shared slot, bypassing the ComputeStep's
// own buffer-creation callbacks for that resource.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ExternallyAssignedBuffer,
                                              reporter,
                                              context,
                                              testContext) {
    constexpr uint32_t kProblemSize = 512;
    constexpr float kFactor = 4.f;

    // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
    // processes 1 vector at a time.
    constexpr uint32_t kWorkgroupSize = kProblemSize / 4;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"ExternallyAssignedBuffer",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Input buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*sksl=*/"inputBlock {\n"
                                 "    float factor;\n"
                                 "    layout(offset = 16) float4 in_data[];\n"
                                 "}\n",
                    },
                    // Output buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,  // shared to allow us to access it from the
                                                     // Builder
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*slot=*/0,
                        /*sksl=*/"outputBlock { float4 out_data[]; }",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // A kernel that multiplies a large array of floats by a supplied factor.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
                }
            )";
        }

        size_t calculateBufferSize(int resourceIndex, const ResourceDesc& r) const override {
            // Only the input buffer is created through the ComputeStep; the output
            // buffer is assigned externally by the test.
            SkASSERT(resourceIndex == 0);
            SkASSERT(r.fFlow == DataFlow::kPrivate);
            return sizeof(float) * (kProblemSize + 4);
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            SkASSERT(r.fFlow == DataFlow::kPrivate);

            // NOTE: SkSpan's size is an element count, not a byte count.
            size_t dataCount = kProblemSize + 4;
            SkASSERT(bufferSize == dataCount * sizeof(float));
            SkSpan<float> inData(static_cast<float*>(buffer), dataCount);
            inData[0] = kFactor;
            for (unsigned int i = 0; i < kProblemSize; ++i) {
                inData[i + 4] = i + 1;
            }
        }
    } step;

    // We allocate a buffer and directly assign it to the DispatchGroup::Builder. The ComputeStep
    // will not participate in the creation of this buffer.
    auto [_, outputInfo] =
            recorder->priv().drawBufferManager()->getStoragePointer(sizeof(float) * kProblemSize);
    REPORTER_ASSERT(reporter, outputInfo, "Failed to allocate output buffer");

    DispatchGroup::Builder builder(recorder.get());
    builder.assignSharedBuffer(outputInfo, 0);

    // Initialize the step with a pre-determined global size
    if (!builder.appendStep(&step, {WorkgroupSize(1, 1, 1)})) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    float* outData = static_cast<float*>(
            map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kProblemSize; ++i) {
        const float expected = (i + 1) * kFactor;
        const float found = outData[i];
        REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
    }
}
767
768 // Tests the storage texture binding for a compute dispatch that writes the same color to every
769 // pixel of a storage texture.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTexture,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // For this test we allocate a 8x8 tile which is written to by a single workgroup of the same
    // size.
    constexpr uint32_t kDim = 8;

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestStorageTexture",
                /*localDispatchSize=*/{kDim, kDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // Each invocation writes opaque green to its own pixel of `dst`.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    textureWrite(dst, sk_LocalInvocationID.xy, half4(0.0, 1.0, 0.0, 1.0));
                }
            )";
        }

        // The storage texture is allocated by the DispatchGroup with these dimensions
        // and color type.
        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            return {{kDim, kDim}, kRGBA_8888_SkColorType};
        }

        // One kDim x kDim workgroup covers the whole texture.
        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    // The storage texture should have been allocated in shared slot 0.
    sk_sp<TextureProxy> texture = builder.getSharedTextureResource(0);
    if (!texture) {
        ERRORF(reporter, "Shared resource at slot 0 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Read the texture contents back into a CPU-side bitmap for verification.
    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, texture.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    // Every pixel should be the green that the kernel wrote.
    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            SkColor4f expected = SkColor4f::FromColor(SK_ColorGREEN);
            SkColor4f color = pixels.getColor4f(x, y);
            REPORTER_ASSERT(reporter, expected == color,
                            "At position {%u, %u}, "
                            "expected {%.1f, %.1f, %.1f, %.1f}, "
                            "found {%.1f, %.1f, %.1f, %.1f}",
                            x, y,
                            expected.fR, expected.fG, expected.fB, expected.fA,
                            color.fR, color.fG, color.fB, color.fA);
        }
    }
}
869
870 // Tests the readonly texture binding for a compute dispatch that random-access reads from a
871 // CPU-populated texture and copies it to a storage texture.
// Verifies that a compute program can read from a sampled ("read-only") texture binding and
// copy its contents into a write-only storage texture binding within a single dispatch.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTextureReadAndWrite,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // For this test we allocate a 8x8 tile which is written to by a single workgroup of the same
    // size.
    constexpr uint32_t kDim = 8;

    class TestComputeStep : public ComputeStep {
    public:
        // Binds a read-only texture at shared slot 0 ("src") and a write-only storage texture
        // at shared slot 1 ("dst").
        TestComputeStep() : ComputeStep(
                /*name=*/"TestStorageTextureReadAndWrite",
                /*localDispatchSize=*/{kDim, kDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kReadOnlyTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"src",
                    },
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // Each invocation copies exactly one texel from `src` to `dst`.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    half4 color = textureRead(src, sk_LocalInvocationID.xy);
                    textureWrite(dst, sk_LocalInvocationID.xy, color);
                }
            )";
        }

        // Only the output texture (resource index 1) is instantiated by the DispatchGroup; the
        // input texture is supplied externally via assignSharedTexture() below.
        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 1);
            return {{kDim, kDim}, kRGBA_8888_SkColorType};
        }

        // A single workgroup covers the whole 8x8 tile.
        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    // Create and populate an input texture.
    SkBitmap srcBitmap;
    SkImageInfo srcInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    srcBitmap.allocPixels(srcInfo);
    SkPixmap srcPixels;
    bool srcPeekPixelsSuccess = srcBitmap.peekPixels(&srcPixels);
    REPORTER_ASSERT(reporter, srcPeekPixelsSuccess);
    // Fill with a gradient that is unique per texel so a mis-addressed read or write would be
    // caught by the verification loop below.
    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            *srcPixels.writable_addr32(x, y) =
                    SkColorSetARGB(255, x * 256 / kDim, y * 256 / kDim, 0);
        }
    }

    auto texInfo = context->priv().caps()->getDefaultSampledTextureInfo(kRGBA_8888_SkColorType,
                                                                        skgpu::Mipmapped::kNo,
                                                                        skgpu::Protected::kNo,
                                                                        skgpu::Renderable::kNo);
    sk_sp<TextureProxy> srcProxy = TextureProxy::Make(context->priv().caps(),
                                                      recorder->priv().resourceProvider(),
                                                      {kDim, kDim},
                                                      texInfo,
                                                      "ComputeTestSrcProxy",
                                                      skgpu::Budgeted::kNo);
    MipLevel mipLevel;
    mipLevel.fPixels = srcPixels.addr();
    mipLevel.fRowBytes = srcPixels.rowBytes();
    // Schedule an upload of the CPU pixel data into the source proxy before the dispatch runs.
    UploadInstance upload = UploadInstance::Make(recorder.get(),
                                                 srcProxy,
                                                 srcPixels.info().colorInfo(),
                                                 srcPixels.info().colorInfo(),
                                                 {mipLevel},
                                                 SkIRect::MakeWH(kDim, kDim),
                                                 std::make_unique<ImageUploadContext>());
    if (!upload.isValid()) {
        ERRORF(reporter, "Could not create UploadInstance");
        return;
    }
    recorder->priv().add(UploadTask::Make(std::move(upload)));

    DispatchGroup::Builder builder(recorder.get());

    // Assign the input texture to slot 0. This corresponds to the ComputeStep's "src" texture
    // binding.
    builder.assignSharedTexture(std::move(srcProxy), 0);

    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
    if (!dst) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Read the output texture back to the CPU and compare it texel-by-texel with the input.
    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            // NOTE(review): FromBytes_RGBA reinterprets the ARGB-packed word, but this matches
            // how the raw words were written into the RGBA_8888 source pixmap above, so both
            // sides of the comparison use the same channel interpretation.
            SkColor4f expected = SkColor4f::FromBytes_RGBA(
                    SkColorSetARGB(255, x * 256 / kDim, y * 256 / kDim, 0));
            SkColor4f color = pixels.getColor4f(x, y);
            REPORTER_ASSERT(reporter, expected == color,
                            "At position {%u, %u}, "
                            "expected {%.1f, %.1f, %.1f, %.1f}, "
                            "found {%.1f, %.1f, %.1f, %.1f}",
                            x, y,
                            expected.fR, expected.fG, expected.fB, expected.fA,
                            color.fR, color.fG, color.fB, color.fA);
        }
    }
}
1027
// Verifies that a compute program can consume a read-only storage buffer binding: each
// invocation unpacks one packed RGBA word from the buffer and writes it to a storage texture.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ReadOnlyStorageBuffer,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // For this test we allocate a 8x8 tile which is written to by a single workgroup of the same
    // size.
    constexpr uint32_t kDim = 8;

    class TestComputeStep : public ComputeStep {
    public:
        // Binds a read-only, CPU-mapped storage buffer at shared slot 0 ("src") and a
        // write-only storage texture at shared slot 1 ("dst").
        TestComputeStep() : ComputeStep(
                /*name=*/"TestReadOnlyStorageBuffer",
                /*localDispatchSize=*/{kDim, kDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kReadOnlyStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                        /*sksl=*/"src { uint in_data[]; }",
                    },
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // Each invocation reads the packed word for its texel and expands the four bytes into
        // a normalized half4 color (byte layout matches prepareStorageBuffer() below).
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    uint ix = sk_LocalInvocationID.y * 8 + sk_LocalInvocationID.x;
                    uint value = in_data[ix];
                    half4 splat = half4(
                        half(value & 0xFF),
                        half((value >> 8) & 0xFF),
                        half((value >> 16) & 0xFF),
                        half((value >> 24) & 0xFF)
                    );
                    textureWrite(dst, sk_LocalInvocationID.xy, splat / 255.0);
                }
            )";
        }

        // One 32-bit word per texel.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            return kDim * kDim * sizeof(uint32_t);
        }

        // Fills the input buffer with a per-texel gradient: the low byte encodes x, the second
        // byte encodes y, and the high (alpha) byte is fully opaque.
        void prepareStorageBuffer(int index,
                                  const ResourceDesc&,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(index == 0);
            SkASSERT(bufferSize == kDim * kDim * sizeof(uint32_t));

            uint32_t* inputs = reinterpret_cast<uint32_t*>(buffer);
            for (uint32_t y = 0; y < kDim; ++y) {
                for (uint32_t x = 0; x < kDim; ++x) {
                    // Use an unsigned literal for the alpha byte: `255 << 24` shifts into the
                    // sign bit of a signed int, whose conversion back to uint32_t is
                    // implementation-defined before C++20. `255u << 24` yields 0xFF000000
                    // portably.
                    uint32_t value = ((x * 256 / kDim) & 0xFF) |
                                     ((y * 256 / kDim) & 0xFF) << 8 |
                                     255u << 24;
                    *(inputs++) = value;
                }
            }
        }

        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 1);
            return {{kDim, kDim}, kRGBA_8888_SkColorType};
        }

        // A single workgroup covers the whole 8x8 tile.
        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
    if (!dst) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Read the output texture back to the CPU and verify the gradient round-tripped through
    // the storage buffer and the kernel's byte unpacking.
    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            SkColor4f expected =
                    SkColor4f::FromColor(SkColorSetARGB(255, x * 256 / kDim, y * 256 / kDim, 0));
            SkColor4f color = pixels.getColor4f(x, y);
            // Compare channel-by-channel so a single mismatched component fails the test.
            bool pass = true;
            for (int i = 0; i < 4; i++) {
                pass &= color[i] == expected[i];
            }
            REPORTER_ASSERT(reporter, pass,
                            "At position {%u, %u}, "
                            "expected {%.1f, %.1f, %.1f, %.1f}, "
                            "found {%.1f, %.1f, %.1f, %.1f}",
                            x, y,
                            expected.fR, expected.fG, expected.fB, expected.fA,
                            color.fR, color.fG, color.fB, color.fA);
        }
    }
}
1170
1171 // Tests that a texture written by one compute step can be sampled by a subsequent step.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTextureMultipleComputeSteps,reporter,context,testContext)1172 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTextureMultipleComputeSteps,
1173 reporter,
1174 context,
1175 testContext) {
1176 std::unique_ptr<Recorder> recorder = context->makeRecorder();
1177
1178 // For this test we allocate a 8x8 tile which is written to by a single workgroup of the same
1179 // size.
1180 constexpr uint32_t kDim = 8;
1181
1182 // Writes to a texture in slot 0.
1183 class TestComputeStep1 : public ComputeStep {
1184 public:
1185 TestComputeStep1() : ComputeStep(
1186 /*name=*/"TestStorageTexturesFirstPass",
1187 /*localDispatchSize=*/{kDim, kDim, 1},
1188 /*resources=*/{
1189 {
1190 /*type=*/ResourceType::kWriteOnlyStorageTexture,
1191 /*flow=*/DataFlow::kShared,
1192 /*policy=*/ResourcePolicy::kNone,
1193 /*slot=*/0,
1194 /*sksl=*/"dst",
1195 }
1196 }) {}
1197 ~TestComputeStep1() override = default;
1198
1199 std::string computeSkSL() const override {
1200 return R"(
1201 void main() {
1202 textureWrite(dst, sk_LocalInvocationID.xy, half4(0.0, 1.0, 0.0, 1.0));
1203 }
1204 )";
1205 }
1206
1207 std::tuple<SkISize, SkColorType> calculateTextureParameters(
1208 int index, const ResourceDesc& r) const override {
1209 SkASSERT(index == 0);
1210 return {{kDim, kDim}, kRGBA_8888_SkColorType};
1211 }
1212
1213 WorkgroupSize calculateGlobalDispatchSize() const override {
1214 return WorkgroupSize(1, 1, 1);
1215 }
1216 } step1;
1217
1218 // Reads from the texture in slot 0 and writes it to another texture in slot 1.
1219 class TestComputeStep2 : public ComputeStep {
1220 public:
1221 TestComputeStep2() : ComputeStep(
1222 /*name=*/"TestStorageTexturesSecondPass",
1223 /*localDispatchSize=*/{kDim, kDim, 1},
1224 /*resources=*/{
1225 {
1226 /*type=*/ResourceType::kReadOnlyTexture,
1227 /*flow=*/DataFlow::kShared,
1228 /*policy=*/ResourcePolicy::kNone,
1229 /*slot=*/0,
1230 /*sksl=*/"src",
1231 },
1232 {
1233 /*type=*/ResourceType::kWriteOnlyStorageTexture,
1234 /*flow=*/DataFlow::kShared,
1235 /*policy=*/ResourcePolicy::kNone,
1236 /*slot=*/1,
1237 /*sksl=*/"dst",
1238 }
1239 }) {}
1240 ~TestComputeStep2() override = default;
1241
1242 std::string computeSkSL() const override {
1243 return R"(
1244 void main() {
1245 half4 color = textureRead(src, sk_LocalInvocationID.xy);
1246 textureWrite(dst, sk_LocalInvocationID.xy, color);
1247 }
1248 )";
1249 }
1250
1251 std::tuple<SkISize, SkColorType> calculateTextureParameters(
1252 int index, const ResourceDesc& r) const override {
1253 SkASSERT(index == 1);
1254 return {{kDim, kDim}, kRGBA_8888_SkColorType};
1255 }
1256
1257 WorkgroupSize calculateGlobalDispatchSize() const override {
1258 return WorkgroupSize(1, 1, 1);
1259 }
1260 } step2;
1261
1262 DispatchGroup::Builder builder(recorder.get());
1263 builder.appendStep(&step1);
1264 builder.appendStep(&step2);
1265
1266 sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
1267 if (!dst) {
1268 ERRORF(reporter, "shared resource at slot 1 is missing");
1269 return;
1270 }
1271
1272 // Record the compute task
1273 ComputeTask::DispatchGroupList groups;
1274 groups.push_back(builder.finalize());
1275 recorder->priv().add(ComputeTask::Make(std::move(groups)));
1276
1277 // Submit the work and wait for it to complete.
1278 std::unique_ptr<Recording> recording = recorder->snap();
1279 if (!recording) {
1280 ERRORF(reporter, "Failed to make recording");
1281 return;
1282 }
1283
1284 InsertRecordingInfo insertInfo;
1285 insertInfo.fRecording = recording.get();
1286 context->insertRecording(insertInfo);
1287 testContext->syncedSubmit(context);
1288
1289 SkBitmap bitmap;
1290 SkImageInfo imgInfo =
1291 SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
1292 bitmap.allocPixels(imgInfo);
1293
1294 SkPixmap pixels;
1295 bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
1296 REPORTER_ASSERT(reporter, peekPixelsSuccess);
1297
1298 bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
1299 REPORTER_ASSERT(reporter, readPixelsSuccess);
1300
1301 for (uint32_t x = 0; x < kDim; ++x) {
1302 for (uint32_t y = 0; y < kDim; ++y) {
1303 SkColor4f expected = SkColor4f::FromColor(SK_ColorGREEN);
1304 SkColor4f color = pixels.getColor4f(x, y);
1305 REPORTER_ASSERT(reporter, expected == color,
1306 "At position {%u, %u}, "
1307 "expected {%.1f, %.1f, %.1f, %.1f}, "
1308 "found {%.1f, %.1f, %.1f, %.1f}",
1309 x, y,
1310 expected.fR, expected.fG, expected.fB, expected.fA,
1311 color.fR, color.fG, color.fB, color.fA);
1312 }
1313 }
1314 }
1315
1316 // Tests that a texture can be sampled by a compute step using a sampler.
1317 // TODO(armansito): Once the previous TODO is done, add additional tests that exercise mixed use of
1318 // texture, buffer, and sampler bindings.
// Verifies texture sampling from a compute program: the first step writes an 8x8 checkerboard,
// the second downsamples it to 4x4 with a linear repeat sampler so each output texel averages a
// 2x2 neighborhood straddling the checker boundary (expected red ~0.5).
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_SampledTexture,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // The first ComputeStep initializes a 8x8 texture with a checkerboard pattern of alternating
    // red and black pixels. The second ComputeStep downsamples this texture into a 4x4 using
    // bilinear filtering at pixel borders, intentionally averaging the values of each 4x4 tile in
    // the source texture, and writes the result to the destination texture.
    constexpr uint32_t kSrcDim = 8;
    constexpr uint32_t kDstDim = 4;

    class TestComputeStep1 : public ComputeStep {
    public:
        TestComputeStep1() : ComputeStep(
                /*name=*/"Test_SampledTexture_Init",
                /*localDispatchSize=*/{kSrcDim, kSrcDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep1() override = default;

        // Writes a red/black checkerboard: red where (x + y) is odd, black otherwise.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    uint2 c = sk_LocalInvocationID.xy;
                    uint checkerBoardColor = (c.x + (c.y % 2)) % 2;
                    textureWrite(dst, c, half4(checkerBoardColor, 0, 0, 1));
                }
            )";
        }

        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            return {{kSrcDim, kSrcDim}, kRGBA_8888_SkColorType};
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step1;

    class TestComputeStep2 : public ComputeStep {
    public:
        TestComputeStep2() : ComputeStep(
                /*name=*/"Test_SampledTexture_Sample",
                /*localDispatchSize=*/{kDstDim, kDstDim, 1},
                /*resources=*/{
                    // Declare the storage texture before the sampled texture. This tests that
                    // binding index assignment works consistently across all backends when a
                    // sampler-less texture and a texture+sampler pair are intermixed and sampler
                    // bindings aren't necessarily contiguous when the ranges are distinct.
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"dst",
                    },
                    {
                        /*type=*/ResourceType::kSampledTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"src",
                    }
                }) {}
        ~TestComputeStep2() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    // Normalize the 4x4 invocation indices and sample the source texture using
                    // that.
                    uint2 dstCoord = sk_LocalInvocationID.xy;
                    const float2 dstSizeInv = float2(0.25, 0.25);
                    float2 unormCoord = float2(dstCoord) * dstSizeInv;

                    // Use explicit LOD, as quad derivatives are not available to a compute shader.
                    half4 color = sampleLod(src, unormCoord, 0);
                    textureWrite(dst, dstCoord, color);
                }
            )";
        }

        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0 || index == 1);
            return {{kDstDim, kDstDim}, kRGBA_8888_SkColorType};
        }

        SamplerDesc calculateSamplerParameters(int index, const ResourceDesc&) const override {
            SkASSERT(index == 1);
            // Use the repeat tile mode to sample an infinite checkerboard.
            return {SkFilterMode::kLinear, SkTileMode::kRepeat};
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step2;

    DispatchGroup::Builder builder(recorder.get());
    // Check the appendStep() results, consistent with the other tests in this file; a failure
    // here previously only surfaced as a less specific error from getSharedTextureResource().
    if (!builder.appendStep(&step1)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }
    if (!builder.appendStep(&step2)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
    if (!dst) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Read back the 4x4 result; bilinear filtering at checker boundaries should average red
    // and black, so every texel's red channel must land near 0.5.
    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDstDim, kDstDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    for (uint32_t x = 0; x < kDstDim; ++x) {
        for (uint32_t y = 0; y < kDstDim; ++y) {
            SkColor4f color = pixels.getColor4f(x, y);
            REPORTER_ASSERT(reporter, color.fR > 0.49 && color.fR < 0.51,
                            "At position {%u, %u}, "
                            "expected red channel in range [0.49, 0.51], "
                            "found {%.3f}",
                            x, y, color.fR);
        }
    }
}
1479
1480 // TODO(b/260622403): The shader tested here is identical to
1481 // `resources/sksl/compute/AtomicsOperations.compute`. It would be nice to be able to exercise SkSL
1482 // features like this as part of SkSLTest.cpp instead of as a graphite test.
1483 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
1484 // compute programs.
// Verifies SkSL atomic intrinsics: 32 workgroups of 128 threads each tally a workgroup-shared
// counter and fold it into a single global counter, which must equal 32 * 128 on readback.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_AtomicOperationsTest,
                                              reporter,
                                              context,
                                              testContext) {
    // This fails on Dawn D3D11, b/315834710
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 128;

    class TestComputeStep : public ComputeStep {
    public:
        // The single shared resource is a CPU-mapped storage buffer holding one atomicUint
        // global counter, zero-initialized by prepareStorageBuffer().
        TestComputeStep() : ComputeStep(
                /*name=*/"TestAtomicOperations",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                        /*sksl=*/"ssbo { atomicUint globalCounter; }",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // A kernel that increments a global (device memory) counter across multiple workgroups.
        // Each workgroup maintains its own independent tally in a workgroup-shared counter which
        // is then added to the global count.
        //
        // This exercises atomic store/load/add and coherent reads and writes over memory in storage
        // and workgroup address spaces.
        std::string computeSkSL() const override {
            return R"(
                workgroup atomicUint localCounter;

                void main() {
                    // Initialize the local counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicStore(localCounter, 0);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();

                    // All threads increment the counter.
                    atomicAdd(localCounter, 1);

                    // Synchronize the threads again to ensure they have all executed the increment
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();

                    // Add the workgroup-only tally to the global counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicAdd(globalCounter, atomicLoad(localCounter));
                    }
                }
            )";
        }

        // The buffer only needs to hold the single 32-bit global counter.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(uint32_t);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(kWorkgroupCount, 1, 1);
        }

        // Zero the global counter before the dispatch runs.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            *static_cast<uint32_t*>(buffer) = 0;
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step);

    BindBufferInfo info = builder.getSharedBufferResource(0);
    if (!info) {
        ERRORF(reporter, "shared resource at slot 0 is missing");
        return;
    }

    // Record the compute pass task.
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
    const uint32_t result = static_cast<const uint32_t*>(
            map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
    REPORTER_ASSERT(reporter,
                    result == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    result);
}
1609
1610 // TODO(b/260622403): The shader tested here is identical to
1611 // `resources/sksl/compute/AtomicsOperationsOverArrayAndStruct.compute`. It would be nice to be able
1612 // to exercise SkSL features like this as part of SkSLTest.cpp instead of as a graphite test.
1613 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
1614 // compute programs.
// Variant of Compute_AtomicOperationsTest that exercises atomics declared inside a workgroup
// array and inside a storage-buffer struct: half of each workgroup tallies into one counter and
// half into the other; each global count must equal 32 * 128 / 2 on readback.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_AtomicOperationsOverArrayAndStructTest,
                                              reporter,
                                              context,
                                              testContext) {
    // This fails on Dawn D3D11, b/315834710
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 128;

    class TestComputeStep : public ComputeStep {
    public:
        // The single shared resource is a CPU-mapped storage buffer holding two atomicUint
        // counters, both zero-initialized by prepareStorageBuffer().
        TestComputeStep() : ComputeStep(
                /*name=*/"TestAtomicOperationsOverArrayAndStruct",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                        /*sksl=*/"ssbo {\n"
                                 "    atomicUint globalCountsFirstHalf;\n"
                                 "    atomicUint globalCountsSecondHalf;\n"
                                 "}\n"
                    }
                }) {}
        ~TestComputeStep() override = default;

        // Construct a kernel that increments a two global (device memory) counters across multiple
        // workgroups. Each workgroup maintains its own independent tallies in workgroup-shared
        // counters which are then added to the global counts.
        //
        // This exercises atomic store/load/add and coherent reads and writes over memory in storage
        // and workgroup address spaces.
        std::string computeSkSL() const override {
            return R"(
                const uint WORKGROUP_SIZE = 128;

                workgroup atomicUint localCounts[2];

                void main() {
                    // Initialize the local counts.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicStore(localCounts[0], 0);
                        atomicStore(localCounts[1], 0);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();

                    // Each thread increments one of the local counters based on its invocation
                    // index.
                    uint idx = sk_LocalInvocationID.x < (WORKGROUP_SIZE / 2) ? 0 : 1;
                    atomicAdd(localCounts[idx], 1);

                    // Synchronize the threads again to ensure they have all executed the increments
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();

                    // Add the workgroup-only tally to the global counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicAdd(globalCountsFirstHalf, atomicLoad(localCounts[0]));
                        atomicAdd(globalCountsSecondHalf, atomicLoad(localCounts[1]));
                    }
                }
            )";
        }

        // Two 32-bit words: one per global counter.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return 2 * sizeof(uint32_t);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(kWorkgroupCount, 1, 1);
        }

        // Zero both global counters before the dispatch runs.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            uint32_t* data = static_cast<uint32_t*>(buffer);
            data[0] = 0;
            data[1] = 0;
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step);

    BindBufferInfo info = builder.getSharedBufferResource(0);
    if (!info) {
        ERRORF(reporter, "shared resource at slot 0 is missing");
        return;
    }

    // Record the compute pass task.
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize / 2;

    const uint32_t* ssboData = static_cast<const uint32_t*>(
            map_buffer(context, testContext, buffer.get(), info.fOffset));
    const uint32_t firstHalfCount = ssboData[0];
    const uint32_t secondHalfCount = ssboData[1];
    REPORTER_ASSERT(reporter,
                    firstHalfCount == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    firstHalfCount);
    REPORTER_ASSERT(reporter,
                    secondHalfCount == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    secondHalfCount);
}
1758
// Verifies that a storage buffer requested with ResourcePolicy::kClear is zero-initialized
// before the dispatch runs: the kernel copies the cleared buffer into a mapped output buffer,
// which must read back as all zeros.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearedBuffer,
                                              reporter,
                                              context,
                                              testContext) {
    constexpr uint32_t kProblemSize = 512;

    // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
    // processes 1 vector at a time.
    constexpr uint32_t kWorkgroupSize = kProblemSize / 4;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // The ComputeStep requests an unmapped buffer that is zero-initialized. It writes the output to
    // a mapped buffer which test verifies.
    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestClearedBuffer",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Zero initialized input buffer
                    {
                        // TODO(b/299979165): Declare this binding as read-only.
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kClear,
                        /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
                    },
                    // Output buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared, // shared to allow us to access it from the
                                                    // Builder
                        /*policy=*/ResourcePolicy::kMapped, // mappable for read-back
                        /*slot=*/0,
                        /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // Each invocation copies one uint4 from the cleared input into the output.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
                }
            )";
        }

        // Both buffers (input and output) use the same size.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            return sizeof(uint32_t) * kProblemSize;
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            // Should receive this call only for the mapped buffer.
            SkASSERT(resourceIndex == 1);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    // The output buffer should have been placed in the right output slot.
    BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
    if (!outputInfo) {
        ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    uint32_t* outData = static_cast<uint32_t*>(
            map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kProblemSize; ++i) {
        const uint32_t found = outData[i];
        REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
    }
}
1866
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearOrdering,
                                              reporter,
                                              context,
                                              testContext) {
    // Initiate two independent DispatchGroups operating on the same buffer. The first group
    // writes garbage to the buffer and the second group copies the contents to an output buffer
    // after requesting a clear of the shared buffer. This test validates that the writes, the
    // clear, and the reads occur in the expected order (garbage write -> clear -> copy), i.e.
    // the copy must observe all zeros.
    constexpr uint32_t kWorkgroupSize = 64;

    // First stage: initialize the shared buffer with non-zero data.
    class FillWithGarbage : public ComputeStep {
    public:
        FillWithGarbage() : ComputeStep(
                /*name=*/"FillWithGarbage",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
                    }
                }) {}
        ~FillWithGarbage() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = uint4(0xFE);
                }
            )";
        }
    } garbageStep;

    // Second stage just copies the data to a destination buffer. This is only to verify that this
    // stage, issued in a separate DispatchGroup, observes the clear.
    class CopyBuffer : public ComputeStep {
    public:
        CopyBuffer() : ComputeStep(
                /*name=*/"CopyBuffer",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
                    },
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
                    }
                }) {}
        ~CopyBuffer() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
                }
            )";
        }
    } copyStep;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();
    DispatchGroup::Builder builder(recorder.get());

    // Each of the kWorkgroupSize invocations reads/writes one uint4, i.e. 4 uint32_t's.
    constexpr size_t kElementCount = 4 * kWorkgroupSize;
    constexpr size_t kBufferSize = sizeof(uint32_t) * kElementCount;
    auto input = recorder->priv().drawBufferManager()->getStorage(kBufferSize);
    auto [_, output] = recorder->priv().drawBufferManager()->getStoragePointer(kBufferSize);

    ComputeTask::DispatchGroupList groups;

    // First group: fill `input` with garbage.
    builder.assignSharedBuffer(input, 0);
    builder.appendStep(&garbageStep, {{1, 1, 1}});
    groups.push_back(builder.finalize());

    // Second group: request a clear of `input` before the copy step reads it.
    builder.reset();
    builder.assignSharedBuffer(input, 0, ClearBuffer::kYes);
    builder.assignSharedBuffer(output, 1);
    builder.appendStep(&copyStep, {{1, 1, 1}});
    groups.push_back(builder.finalize());

    recorder->priv().add(ComputeTask::Make(std::move(groups)));
    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), output.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = submit_recording(context, testContext, recorder.get());
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    // Verify the contents of the output buffer: the clear must win over the garbage write, so
    // every element is expected to be 0.
    uint32_t* outData = static_cast<uint32_t*>(
            map_buffer(context, testContext, outputBuffer.get(), output.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kElementCount; ++i) {
        const uint32_t found = outData[i];
        REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
    }
}
1978
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearOrderingScratchBuffers,
                                              reporter,
                                              context,
                                              testContext) {
    // This test is the same as the ClearOrdering test but the two stages write to a recycled
    // ScratchBuffer. This is primarily to test ScratchBuffer reuse: both groups request scratch
    // storage of the same size, and the second request is expected to recycle the allocation the
    // first group returned to the pool.
    constexpr uint32_t kWorkgroupSize = 64;

    // First stage: initialize the (scratch) buffer with non-zero data.
    class FillWithGarbage : public ComputeStep {
    public:
        FillWithGarbage() : ComputeStep(
                /*name=*/"FillWithGarbage",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
                    }
                }) {}
        ~FillWithGarbage() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = uint4(0xFE);
                }
            )";
        }
    } garbageStep;

    // Second stage just copies the data to a destination buffer. This is only to verify that this
    // stage (issued in a separate DispatchGroup) sees the changes.
    class CopyBuffer : public ComputeStep {
    public:
        CopyBuffer() : ComputeStep(
                /*name=*/"CopyBuffer",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
                    },
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
                    }
                }) {}
        ~CopyBuffer() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
                }
            )";
        }
    } copyStep;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();
    DispatchGroup::Builder builder(recorder.get());

    // Each of the kWorkgroupSize invocations reads/writes one uint4, i.e. 4 uint32_t's.
    constexpr size_t kElementCount = 4 * kWorkgroupSize;
    constexpr size_t kBufferSize = sizeof(uint32_t) * kElementCount;
    auto [_, output] = recorder->priv().drawBufferManager()->getStoragePointer(kBufferSize);

    ComputeTask::DispatchGroupList groups;

    // First group: fill the scratch buffer with garbage.
    {
        auto scratch = recorder->priv().drawBufferManager()->getScratchStorage(kBufferSize);
        auto input = scratch.suballocate(kBufferSize);
        builder.assignSharedBuffer(input, 0);

        // `scratch` returns to the scratch buffer pool when it goes out of scope
    }
    builder.appendStep(&garbageStep, {{1, 1, 1}});
    groups.push_back(builder.finalize());

    // Second group: a same-sized scratch request, expected to recycle the pooled allocation;
    // request a clear of it before the copy step reads it.
    builder.reset();
    {
        auto scratch = recorder->priv().drawBufferManager()->getScratchStorage(kBufferSize);
        auto input = scratch.suballocate(kBufferSize);
        builder.assignSharedBuffer(input, 0, ClearBuffer::kYes);
    }
    builder.assignSharedBuffer(output, 1);
    builder.appendStep(&copyStep, {{1, 1, 1}});
    groups.push_back(builder.finalize());

    recorder->priv().add(ComputeTask::Make(std::move(groups)));
    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), output.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = submit_recording(context, testContext, recorder.get());
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    // Verify the contents of the output buffer: the clear must win over the garbage write, so
    // every element is expected to be 0.
    uint32_t* outData = static_cast<uint32_t*>(
            map_buffer(context, testContext, outputBuffer.get(), output.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kElementCount; ++i) {
        const uint32_t found = outData[i];
        REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
    }
}
2098
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_IndirectDispatch,
                                              reporter,
                                              context,
                                              testContext) {
    // This fails on Dawn D3D11, b/315834710
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 64;

    // `IndirectStep` populates a buffer with the global workgroup count for `CountStep`.
    // `CountStep` is recorded using `DispatchGroup::appendStepIndirect()` and its workgroups get
    // dispatched according to the values computed by `IndirectStep` on the GPU.
    class IndirectStep : public ComputeStep {
    public:
        IndirectStep()
                : ComputeStep(
                        /*name=*/"TestIndirectDispatch_IndirectStep",
                        /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                        /*resources=*/
                        {{
                                /*type=*/ResourceType::kIndirectBuffer,
                                /*flow=*/DataFlow::kShared,
                                /*policy=*/ResourcePolicy::kClear,
                                /*slot=*/0,
                                // TODO(armansito): Ideally the SSBO would have a single member of
                                // type `IndirectDispatchArgs` struct type. SkSL modules don't
                                // support struct declarations so this is currently not possible.
                                /*sksl=*/"ssbo { uint indirect[]; }",
                        }}) {}
        ~IndirectStep() override = default;

        // Kernel that specifies a workgroup size of `kWorkgroupCount` to be used by the indirect
        // dispatch. Only the first invocation writes; it fills in the (x, y, z) workgroup counts.
        std::string computeSkSL() const override {
            return R"(
                // This needs to match `kWorkgroupCount` declared above.
                const uint kWorkgroupCount = 32;

                void main() {
                    if (sk_LocalInvocationID.x == 0) {
                        indirect[0] = kWorkgroupCount;
                        indirect[1] = 1;
                        indirect[2] = 1;
                    }
                }
            )";
        }

        // The shared buffer at slot 0 holds exactly one set of indirect dispatch arguments.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return kIndirectDispatchArgumentSize;
        }

        // A single workgroup is plenty: only invocation 0 does any work.
        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } indirectStep;

    class CountStep : public ComputeStep {
    public:
        CountStep()
                : ComputeStep(
                        /*name=*/"TestIndirectDispatch_CountStep",
                        /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                        /*resources=*/
                        {{
                                /*type=*/ResourceType::kStorageBuffer,
                                /*flow=*/DataFlow::kShared,
                                /*policy=*/ResourcePolicy::kMapped,
                                /*slot=*/1,
                                /*sksl=*/"ssbo { atomicUint globalCounter; }",
                        }}) {}
        ~CountStep() override = default;

        // Every invocation bumps a workgroup-local atomic; invocation 0 of each workgroup then
        // folds the local tally into the global counter, so the final value equals the total
        // number of invocations.
        std::string computeSkSL() const override {
            return R"(
                workgroup atomicUint localCounter;

                void main() {
                    // Initialize the local counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicStore(localCounter, 0);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();

                    // All threads increment the counter.
                    atomicAdd(localCounter, 1);

                    // Synchronize the threads again to ensure they have all executed the increment
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();

                    // Add the workgroup-only tally to the global counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicAdd(globalCounter, atomicLoad(localCounter));
                    }
                }
            )";
        }

        // A single uint32_t: the global atomic counter bound at slot 1.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 1);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(uint32_t);
        }

        // Zero-initialize the mapped counter before the dispatch runs.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            *static_cast<uint32_t*>(buffer) = 0;
        }
    } countStep;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&indirectStep);
    BindBufferInfo indirectBufferInfo = builder.getSharedBufferResource(0);
    if (!indirectBufferInfo) {
        ERRORF(reporter, "Shared resource at slot 0 is missing");
        return;
    }
    REPORTER_ASSERT(reporter, indirectBufferInfo.fSize == kIndirectDispatchArgumentSize);
    // Dispatch `countStep` with the workgroup count read from the GPU-populated indirect buffer
    // instead of a CPU-side calculateGlobalDispatchSize().
    builder.appendStepIndirect(&countStep, indirectBufferInfo);

    BindBufferInfo info = builder.getSharedBufferResource(1);
    if (!info) {
        ERRORF(reporter, "Shared resource at slot 1 is missing");
        return;
    }

    // Record the compute pass task.
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer: one increment per invocation across all
    // indirectly-dispatched workgroups.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
    const uint32_t result = static_cast<const uint32_t*>(
            map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
    REPORTER_ASSERT(reporter,
                    result == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    result);
}
2271
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_NativeShaderSourceMetal,
                                    reporter,
                                    context,
                                    testContext) {
    // Runs a hand-written MSL kernel (bypassing SkSL translation) that tallies one count per
    // invocation into a shared atomic counter, then checks the total on the CPU.
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 1024;

    // A ComputeStep that advertises Flags::kSupportsNativeShader and supplies its kernel
    // directly as Metal Shading Language source.
    class MslAtomicCountStep : public ComputeStep {
    public:
        MslAtomicCountStep() : ComputeStep(
                /*name=*/"TestAtomicOperationsMetal",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                    }
                },
                /*workgroupBuffers=*/{},
                /*baseFlags=*/Flags::kSupportsNativeShader) {}
        ~MslAtomicCountStep() override = default;

        NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
            SkASSERT(format == NativeShaderFormat::kMSL);
            static constexpr std::string_view kSource = R"(
                #include <metal_stdlib>

                using namespace metal;

                kernel void atomicCount(uint3 localId [[thread_position_in_threadgroup]],
                                        device atomic_uint& globalCounter [[buffer(0)]]) {
                    threadgroup atomic_uint localCounter;

                    // Initialize the local counter.
                    if (localId.x == 0u) {
                        atomic_store_explicit(&localCounter, 0u, memory_order_relaxed);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    threadgroup_barrier(mem_flags::mem_threadgroup);

                    // All threads increment the counter.
                    atomic_fetch_add_explicit(&localCounter, 1u, memory_order_relaxed);

                    // Synchronize the threads again to ensure they have all executed the increment
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    threadgroup_barrier(mem_flags::mem_threadgroup);

                    // Add the workgroup-only tally to the global counter.
                    if (localId.x == 0u) {
                        uint tally = atomic_load_explicit(&localCounter, memory_order_relaxed);
                        atomic_fetch_add_explicit(&globalCounter, tally, memory_order_relaxed);
                    }
                }
            )";
            return {kSource, "atomicCount"};
        }

        // One uint32_t: the global atomic counter living in shared slot 0.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(uint32_t);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(kWorkgroupCount, 1, 1);
        }

        // Zero-initialize the mapped counter before the dispatch runs.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            *static_cast<uint32_t*>(buffer) = 0;
        }
    } atomicStep;

    DispatchGroup::Builder groupBuilder(recorder.get());
    groupBuilder.appendStep(&atomicStep);

    BindBufferInfo counterInfo = groupBuilder.getSharedBufferResource(0);
    if (!counterInfo) {
        ERRORF(reporter, "shared resource at slot 0 is missing");
        return;
    }

    // Record the compute pass task.
    ComputeTask::DispatchGroupList dispatchGroups;
    dispatchGroups.push_back(groupBuilder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(dispatchGroups)));

    // Ensure the counter buffer is synchronized to the CPU once the GPU submission has finished.
    auto counterBuffer = sync_buffer_to_cpu(recorder.get(), counterInfo.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Each of the kWorkgroupCount * kWorkgroupSize invocations bumped the counter exactly once.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
    const uint32_t* counts = static_cast<const uint32_t*>(
            map_buffer(context, testContext, counterBuffer.get(), counterInfo.fOffset));
    const uint32_t result = counts[0];
    REPORTER_ASSERT(reporter,
                    result == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    result);
}
2394
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_WorkgroupBufferDescMetal,
                                    reporter,
                                    context,
                                    testContext) {
    // Tests that a threadgroup (workgroup) buffer declared in the ComputeStep's
    // `workgroupBuffers` list gets bound explicitly by the command encoder (see the note on
    // `nativeShaderSource` below), using the same atomic-count kernel as the
    // Compute_NativeShaderSourceMetal test.
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 1024;

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestAtomicOperationsMetal",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                    }
                },
                // One uint32_t of threadgroup memory at threadgroup index 0, matching the
                // `localCounter` entry-point parameter in the MSL below.
                /*workgroupBuffers=*/{
                    {
                        /*size=*/sizeof(uint32_t),
                        /*index=*/0u,
                    }
                },
                /*baseFlags=*/Flags::kSupportsNativeShader) {}
        ~TestComputeStep() override = default;

        // This is the same MSL kernel as in Compute_NativeShaderSourceMetal, except `localCounter`
        // is an entry-point parameter instead of a local variable. This forces the workgroup
        // binding to be encoded explicitly in the command encoder.
        NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
            SkASSERT(format == NativeShaderFormat::kMSL);
            static constexpr std::string_view kSource = R"(
                #include <metal_stdlib>

                using namespace metal;

                kernel void atomicCount(uint3 localId [[thread_position_in_threadgroup]],
                                        device atomic_uint& globalCounter [[buffer(0)]],
                                        threadgroup atomic_uint& localCounter [[threadgroup(0)]]) {
                    // Initialize the local counter.
                    if (localId.x == 0u) {
                        atomic_store_explicit(&localCounter, 0u, memory_order_relaxed);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    threadgroup_barrier(mem_flags::mem_threadgroup);

                    // All threads increment the counter.
                    atomic_fetch_add_explicit(&localCounter, 1u, memory_order_relaxed);

                    // Synchronize the threads again to ensure they have all executed the increment
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    threadgroup_barrier(mem_flags::mem_threadgroup);

                    // Add the workgroup-only tally to the global counter.
                    if (localId.x == 0u) {
                        uint tally = atomic_load_explicit(&localCounter, memory_order_relaxed);
                        atomic_fetch_add_explicit(&globalCounter, tally, memory_order_relaxed);
                    }
                }
            )";
            return {kSource, "atomicCount"};
        }

        // One uint32_t: the global atomic counter living in shared slot 0.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(uint32_t);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(kWorkgroupCount, 1, 1);
        }

        // Zero-initialize the mapped counter before the dispatch runs.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            *static_cast<uint32_t*>(buffer) = 0;
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step);

    BindBufferInfo info = builder.getSharedBufferResource(0);
    if (!info) {
        ERRORF(reporter, "shared resource at slot 0 is missing");
        return;
    }

    // Record the compute pass task.
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer: one increment per invocation across all
    // workgroups.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
    const uint32_t result = static_cast<const uint32_t*>(
            map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
    REPORTER_ASSERT(reporter,
                    result == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    result);
}
2524
DEF_GRAPHITE_TEST_FOR_DAWN_CONTEXT(Compute_NativeShaderSourceWGSL, reporter, context, testContext) {
    // Runs a hand-written WGSL kernel (bypassing SkSL translation) that tallies one count per
    // invocation into a shared atomic counter, then checks the total on the CPU.

    // This fails on Dawn D3D11, b/315834710
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    constexpr uint32_t kWorkgroupCount = 32;
    // The WebGPU compat default workgroup size limit is 128.
    constexpr uint32_t kWorkgroupSize = 128;

    // A ComputeStep that advertises Flags::kSupportsNativeShader and supplies its kernel
    // directly as WGSL source.
    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestAtomicOperationsWGSL",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                    }
                },
                /*workgroupBuffers=*/{},
                /*baseFlags=*/Flags::kSupportsNativeShader) {}
        ~TestComputeStep() override = default;

        // NOTE: the `@workgroup_size(128)` literal in the WGSL below must stay in sync with
        // `kWorkgroupSize` declared above.
        NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
            SkASSERT(format == NativeShaderFormat::kWGSL);
            static constexpr std::string_view kSource = R"(
                @group(0) @binding(0) var<storage, read_write> globalCounter: atomic<u32>;

                var<workgroup> localCounter: atomic<u32>;

                @compute @workgroup_size(128)
                fn atomicCount(@builtin(local_invocation_id) localId: vec3u) {
                    // Initialize the local counter.
                    if localId.x == 0u {
                        atomicStore(&localCounter, 0u);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();

                    // All threads increment the counter.
                    atomicAdd(&localCounter, 1u);

                    // Synchronize the threads again to ensure they have all executed the increment
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();

                    // Add the workgroup-only tally to the global counter.
                    if localId.x == 0u {
                        let tally = atomicLoad(&localCounter);
                        atomicAdd(&globalCounter, tally);
                    }
                }
            )";
            return {kSource, "atomicCount"};
        }

        // One uint32_t: the global atomic counter living in shared slot 0.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(uint32_t);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(kWorkgroupCount, 1, 1);
        }

        // Zero-initialize the mapped counter before the dispatch runs.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            *static_cast<uint32_t*>(buffer) = 0;
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step);

    BindBufferInfo info = builder.getSharedBufferResource(0);
    if (!info) {
        ERRORF(reporter, "shared resource at slot 0 is missing");
        return;
    }

    // Record the compute pass task.
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer: one increment per invocation across all
    // workgroups.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
    const uint32_t result = static_cast<const uint32_t*>(
            map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
    REPORTER_ASSERT(reporter,
                    result == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    result);
}
2648